; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefixes=AVX
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FCP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512-FCP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX512DQ
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-FCP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512BW
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512BW-FCP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX512DQ-BW
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-BW-FCP

; These patterns are produced by LoopVectorizer for interleaved loads.
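;
; As a rough illustration only (not part of the test input), a stride-5 i16
; deinterleave like the ones exercised below could come from a scalar source
; loop of this shape; the struct and all names here are hypothetical:
;
;   struct S { short f0, f1, f2, f3, f4; };   // five i16 fields -> stride 5
;   void split(struct S *in, short *o0, short *o1, short *o2,
;              short *o3, short *o4, int n) {
;     for (int i = 0; i < n; ++i) {
;       o0[i] = in[i].f0; o1[i] = in[i].f1; o2[i] = in[i].f2;
;       o3[i] = in[i].f3; o4[i] = in[i].f4;
;     }
;   }
;
; LoopVectorizer widens the five strided loads into one wide load of 5*VF
; i16 elements followed by five stride-5 shufflevectors, which is the IR
; pattern each @load_i16_stride5_vfN function below checks.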

define void @load_i16_stride5_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4) nounwind {
; SSE-LABEL: load_i16_stride5_vf2:
; SSE: # %bb.0:
; SSE-NEXT: movdqa (%rdi), %xmm0
; SSE-NEXT: movdqa 16(%rdi), %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,3,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[3,1,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,1,2,3,4,5,6,7]
; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,3,2,3]
; SSE-NEXT: psrlq $48, %xmm0
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSE-NEXT: psrld $16, %xmm1
; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3]
; SSE-NEXT: movd %xmm2, (%rsi)
; SSE-NEXT: movd %xmm3, (%rdx)
; SSE-NEXT: movd %xmm4, (%rcx)
; SSE-NEXT: movd %xmm0, (%r8)
; SSE-NEXT: movd %xmm5, (%r9)
; SSE-NEXT: retq
;
; AVX-LABEL: load_i16_stride5_vf2:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,3,2,3]
; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[3,1,2,3]
; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[2,1,2,3,4,5,6,7]
; AVX-NEXT: vpsrlq $48, %xmm0, %xmm5
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3]
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
; AVX-NEXT: vmovd %xmm2, (%rsi)
; AVX-NEXT: vmovd %xmm3, (%rdx)
; AVX-NEXT: vmovd %xmm4, (%rcx)
; AVX-NEXT: vmovd %xmm5, (%r8)
; AVX-NEXT: vmovd %xmm0, (%r9)
; AVX-NEXT: retq
;
; AVX2-LABEL: load_i16_stride5_vf2:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7]
; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,3,2,3]
; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7]
; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[3,1,2,3]
; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[2,1,2,3,4,5,6,7]
; AVX2-NEXT: vpsrlq $48, %xmm0, %xmm0
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-NEXT: vpbroadcastw 8(%rdi), %xmm5
; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0],xmm1[1],xmm5[2,3,4,5,6,7]
; AVX2-NEXT: vmovd %xmm2, (%rsi)
; AVX2-NEXT: vmovd %xmm3, (%rdx)
; AVX2-NEXT: vmovd %xmm4, (%rcx)
; AVX2-NEXT: vmovd %xmm0, (%r8)
; AVX2-NEXT: vmovd %xmm1, (%r9)
; AVX2-NEXT: retq
;
; AVX2-FP-LABEL: load_i16_stride5_vf2:
; AVX2-FP: # %bb.0:
; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-FP-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,1,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[2,3,12,13,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[4,5,14,15,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-FP-NEXT: vpsrlq $48, %xmm0, %xmm0
; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-FP-NEXT: vpbroadcastw 8(%rdi), %xmm5
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0],xmm1[1],xmm5[2,3,4,5,6,7]
; AVX2-FP-NEXT: vmovd %xmm2, (%rsi)
; AVX2-FP-NEXT: vmovd %xmm3, (%rdx)
; AVX2-FP-NEXT: vmovd %xmm4, (%rcx)
; AVX2-FP-NEXT: vmovd %xmm0, (%r8)
; AVX2-FP-NEXT: vmovd %xmm1, (%r9)
; AVX2-FP-NEXT: retq
;
; AVX2-FCP-LABEL: load_i16_stride5_vf2:
; AVX2-FCP: # %bb.0:
; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,1,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[2,3,12,13,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[4,5,14,15,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-FCP-NEXT: vpsrlq $48, %xmm0, %xmm0
; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-FCP-NEXT: vpbroadcastw 8(%rdi), %xmm5
; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0],xmm1[1],xmm5[2,3,4,5,6,7]
; AVX2-FCP-NEXT: vmovd %xmm2, (%rsi)
; AVX2-FCP-NEXT: vmovd %xmm3, (%rdx)
; AVX2-FCP-NEXT: vmovd %xmm4, (%rcx)
; AVX2-FCP-NEXT: vmovd %xmm0, (%r8)
; AVX2-FCP-NEXT: vmovd %xmm1, (%r9)
; AVX2-FCP-NEXT: retq
;
; AVX512-LABEL: load_i16_stride5_vf2:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7]
; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,3,2,3]
; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7]
; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[3,1,2,3]
; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[2,1,2,3,4,5,6,7]
; AVX512-NEXT: vpsrlq $48, %xmm0, %xmm0
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512-NEXT: vpbroadcastw 8(%rdi), %xmm5
; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0],xmm1[1],xmm5[2,3,4,5,6,7]
; AVX512-NEXT: vmovd %xmm2, (%rsi)
; AVX512-NEXT: vmovd %xmm3, (%rdx)
; AVX512-NEXT: vmovd %xmm4, (%rcx)
; AVX512-NEXT: vmovd %xmm0, (%r8)
; AVX512-NEXT: vmovd %xmm1, (%r9)
; AVX512-NEXT: retq
;
; AVX512-FCP-LABEL: load_i16_stride5_vf2:
; AVX512-FCP: # %bb.0:
; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,1,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[2,3,12,13,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[4,5,14,15,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512-FCP-NEXT: vpsrlq $48, %xmm0, %xmm0
; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512-FCP-NEXT: vpbroadcastw 8(%rdi), %xmm5
; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0],xmm1[1],xmm5[2,3,4,5,6,7]
; AVX512-FCP-NEXT: vmovd %xmm2, (%rsi)
; AVX512-FCP-NEXT: vmovd %xmm3, (%rdx)
; AVX512-FCP-NEXT: vmovd %xmm4, (%rcx)
; AVX512-FCP-NEXT: vmovd %xmm0, (%r8)
; AVX512-FCP-NEXT: vmovd %xmm1, (%r9)
; AVX512-FCP-NEXT: retq
;
; AVX512DQ-LABEL: load_i16_stride5_vf2:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7]
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,3,2,3]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7]
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[3,1,2,3]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[2,1,2,3,4,5,6,7]
; AVX512DQ-NEXT: vpsrlq $48, %xmm0, %xmm0
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512DQ-NEXT: vpbroadcastw 8(%rdi), %xmm5
; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0],xmm1[1],xmm5[2,3,4,5,6,7]
; AVX512DQ-NEXT: vmovd %xmm2, (%rsi)
; AVX512DQ-NEXT: vmovd %xmm3, (%rdx)
; AVX512DQ-NEXT: vmovd %xmm4, (%rcx)
; AVX512DQ-NEXT: vmovd %xmm0, (%r8)
; AVX512DQ-NEXT: vmovd %xmm1, (%r9)
; AVX512DQ-NEXT: retq
;
; AVX512DQ-FCP-LABEL: load_i16_stride5_vf2:
; AVX512DQ-FCP: # %bb.0:
; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,1,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[2,3,12,13,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[4,5,14,15,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-FCP-NEXT: vpsrlq $48, %xmm0, %xmm0
; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512DQ-FCP-NEXT: vpbroadcastw 8(%rdi), %xmm5
; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0],xmm1[1],xmm5[2,3,4,5,6,7]
; AVX512DQ-FCP-NEXT: vmovd %xmm2, (%rsi)
; AVX512DQ-FCP-NEXT: vmovd %xmm3, (%rdx)
; AVX512DQ-FCP-NEXT: vmovd %xmm4, (%rcx)
; AVX512DQ-FCP-NEXT: vmovd %xmm0, (%r8)
; AVX512DQ-FCP-NEXT: vmovd %xmm1, (%r9)
; AVX512DQ-FCP-NEXT: retq
;
; AVX512BW-LABEL: load_i16_stride5_vf2:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7]
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,3,2,3]
; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7]
; AVX512BW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[3,1,2,3]
; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[2,1,2,3,4,5,6,7]
; AVX512BW-NEXT: vpsrlq $48, %xmm0, %xmm0
; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512BW-NEXT: vpbroadcastw 8(%rdi), %xmm5
; AVX512BW-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0],xmm1[1],xmm5[2,3,4,5,6,7]
; AVX512BW-NEXT: vmovd %xmm2, (%rsi)
; AVX512BW-NEXT: vmovd %xmm3, (%rdx)
; AVX512BW-NEXT: vmovd %xmm4, (%rcx)
; AVX512BW-NEXT: vmovd %xmm0, (%r8)
; AVX512BW-NEXT: vmovd %xmm1, (%r9)
; AVX512BW-NEXT: retq
;
; AVX512BW-FCP-LABEL: load_i16_stride5_vf2:
; AVX512BW-FCP: # %bb.0:
; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,1,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[2,3,12,13,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[4,5,14,15,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-FCP-NEXT: vpsrlq $48, %xmm0, %xmm0
; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512BW-FCP-NEXT: vpbroadcastw 8(%rdi), %xmm5
; AVX512BW-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0],xmm1[1],xmm5[2,3,4,5,6,7]
; AVX512BW-FCP-NEXT: vmovd %xmm2, (%rsi)
; AVX512BW-FCP-NEXT: vmovd %xmm3, (%rdx)
; AVX512BW-FCP-NEXT: vmovd %xmm4, (%rcx)
; AVX512BW-FCP-NEXT: vmovd %xmm0, (%r8)
; AVX512BW-FCP-NEXT: vmovd %xmm1, (%r9)
; AVX512BW-FCP-NEXT: retq
;
; AVX512DQ-BW-LABEL: load_i16_stride5_vf2:
; AVX512DQ-BW: # %bb.0:
; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-BW-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7]
; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,3,2,3]
; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7]
; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[3,1,2,3]
; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[2,1,2,3,4,5,6,7]
; AVX512DQ-BW-NEXT: vpsrlq $48, %xmm0, %xmm0
; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512DQ-BW-NEXT: vpbroadcastw 8(%rdi), %xmm5
; AVX512DQ-BW-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0],xmm1[1],xmm5[2,3,4,5,6,7]
; AVX512DQ-BW-NEXT: vmovd %xmm2, (%rsi)
; AVX512DQ-BW-NEXT: vmovd %xmm3, (%rdx)
; AVX512DQ-BW-NEXT: vmovd %xmm4, (%rcx)
; AVX512DQ-BW-NEXT: vmovd %xmm0, (%r8)
; AVX512DQ-BW-NEXT: vmovd %xmm1, (%r9)
; AVX512DQ-BW-NEXT: retq
;
; AVX512DQ-BW-FCP-LABEL: load_i16_stride5_vf2:
; AVX512DQ-BW-FCP: # %bb.0:
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,1,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[2,3,12,13,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[4,5,14,15,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: vpsrlq $48, %xmm0, %xmm0
; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512DQ-BW-FCP-NEXT: vpbroadcastw 8(%rdi), %xmm5
; AVX512DQ-BW-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm5[0],xmm1[1],xmm5[2,3,4,5,6,7]
; AVX512DQ-BW-FCP-NEXT: vmovd %xmm2, (%rsi)
; AVX512DQ-BW-FCP-NEXT: vmovd %xmm3, (%rdx)
; AVX512DQ-BW-FCP-NEXT: vmovd %xmm4, (%rcx)
; AVX512DQ-BW-FCP-NEXT: vmovd %xmm0, (%r8)
; AVX512DQ-BW-FCP-NEXT: vmovd %xmm1, (%r9)
; AVX512DQ-BW-FCP-NEXT: retq
  %wide.vec = load <10 x i16>, ptr %in.vec, align 64
  %strided.vec0 = shufflevector <10 x i16> %wide.vec, <10 x i16> poison, <2 x i32> <i32 0, i32 5>
  %strided.vec1 = shufflevector <10 x i16> %wide.vec, <10 x i16> poison, <2 x i32> <i32 1, i32 6>
  %strided.vec2 = shufflevector <10 x i16> %wide.vec, <10 x i16> poison, <2 x i32> <i32 2, i32 7>
  %strided.vec3 = shufflevector <10 x i16> %wide.vec, <10 x i16> poison, <2 x i32> <i32 3, i32 8>
  %strided.vec4 = shufflevector <10 x i16> %wide.vec, <10 x i16> poison, <2 x i32> <i32 4, i32 9>
  store <2 x i16> %strided.vec0, ptr %out.vec0, align 64
  store <2 x i16> %strided.vec1, ptr %out.vec1, align 64
  store <2 x i16> %strided.vec2, ptr %out.vec2, align 64
  store <2 x i16> %strided.vec3, ptr %out.vec3, align 64
  store <2 x i16> %strided.vec4, ptr %out.vec4, align 64
  ret void
}

define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4) nounwind {
; SSE-LABEL: load_i16_stride5_vf4:
; SSE: # %bb.0:
; SSE-NEXT: movdqa (%rdi), %xmm2
; SSE-NEXT: movdqa 16(%rdi), %xmm3
; SSE-NEXT: movdqa 32(%rdi), %xmm0
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[3,1,2,3]
; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[2,1,2,3,4,5,6,7]
; SSE-NEXT:
pshufd {{.*#+}} xmm1 = xmm2[0,2,2,3] 297; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] 298; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] 299; SSE-NEXT: movdqa %xmm3, %xmm4 300; SSE-NEXT: psrlq $48, %xmm4 301; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,3,2,3] 302; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,2,2,3,4,5,6,7] 303; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] 304; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] 305; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,3,2,1] 306; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7] 307; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] 308; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,1,4,5,6,7] 309; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,1,1,3] 310; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,7] 311; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm3[2],xmm5[3],xmm3[3] 312; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] 313; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[3,1,2,0] 314; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,6,7] 315; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,1,2,3] 316; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,2,0,3,4,5,6,7] 317; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[1,1,1,1] 318; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm3[0,2,2,3] 319; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] 320; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm7[0,3,2,3,4,5,6,7] 321; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] 322; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,0,3,3,4,5,6,7] 323; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] 324; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7] 325; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] 326; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,0,65535,65535,65535,65535] 327; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm3[3,0] 328; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[0,2] 329; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,6,6,7] 330; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] 331; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,3,3,4,5,6,7] 332; SSE-NEXT: pand %xmm7, %xmm2 333; SSE-NEXT: pandn %xmm0, %xmm7 334; SSE-NEXT: por %xmm2, %xmm7 335; SSE-NEXT: movq %xmm1, (%rsi) 336; SSE-NEXT: movq %xmm4, (%rdx) 337; SSE-NEXT: movq %xmm5, (%rcx) 338; SSE-NEXT: movq %xmm6, (%r8) 339; SSE-NEXT: movq %xmm7, (%r9) 340; SSE-NEXT: retq 341; 342; AVX-LABEL: load_i16_stride5_vf4: 343; AVX: # %bb.0: 344; AVX-NEXT: vpshufd {{.*#+}} xmm0 = mem[3,1,2,3] 345; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] 346; AVX-NEXT: vmovdqa (%rdi), %xmm1 347; AVX-NEXT: vmovdqa 16(%rdi), %xmm2 348; AVX-NEXT: vmovdqa 32(%rdi), %xmm3 349; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[0,2,2,3] 350; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] 351; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] 352; AVX-NEXT: vpsrlq $48, %xmm2, %xmm4 353; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[0,3,2,3] 354; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,2,2,3,4,5,6,7] 355; AVX-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] 356; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] 357; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,4,5,8,9,2,3,u,u,u,u,u,u,u,u] 358; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[0,1,1,3] 359; AVX-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,7] 360; AVX-NEXT: vpunpckhdq {{.*#+}} xmm5 = 
xmm5[2],xmm2[2],xmm5[3],xmm2[3] 361; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] 362; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1,4,5,8,9,6,7,u,u,u,u,u,u,u,u] 363; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0,1],xmm1[2,3],xmm2[4,5,6,7] 364; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,10,11,u,u,u,u,u,u,u,u] 365; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] 366; AVX-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,2,2,3,4,5,6,7] 367; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] 368; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5],xmm2[6,7] 369; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u] 370; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[3],xmm1[4,5,6,7] 371; AVX-NEXT: vmovq %xmm0, (%rsi) 372; AVX-NEXT: vmovq %xmm4, (%rdx) 373; AVX-NEXT: vmovq %xmm5, (%rcx) 374; AVX-NEXT: vmovq %xmm6, (%r8) 375; AVX-NEXT: vmovq %xmm1, (%r9) 376; AVX-NEXT: retq 377; 378; AVX2-LABEL: load_i16_stride5_vf4: 379; AVX2: # %bb.0: 380; AVX2-NEXT: vmovdqa (%rdi), %xmm0 381; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1 382; AVX2-NEXT: vmovdqa 32(%rdi), %xmm2 383; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[3,1,2,3] 384; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7] 385; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,2,2,3] 386; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] 387; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] 388; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1,2,3,4,5,6,7] 389; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3] 390; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,3,12,13,6,7,0,1,u,u,u,u,u,u,u,u] 391; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0],xmm0[1,2,3] 392; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm1[2],xmm5[3] 393; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u] 394; AVX2-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] 395; AVX2-NEXT: vpblendd {{.*#+}} xmm6 = xmm1[0],xmm6[1],xmm1[2,3] 396; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u] 397; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3] 398; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u] 399; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7] 400; AVX2-NEXT: vmovq %xmm3, (%rsi) 401; AVX2-NEXT: vmovq %xmm4, (%rdx) 402; AVX2-NEXT: vmovq %xmm5, (%rcx) 403; AVX2-NEXT: vmovq %xmm6, (%r8) 404; AVX2-NEXT: vmovq %xmm0, (%r9) 405; AVX2-NEXT: retq 406; 407; AVX2-FP-LABEL: load_i16_stride5_vf4: 408; AVX2-FP: # %bb.0: 409; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0 410; AVX2-FP-NEXT: vmovdqa 16(%rdi), %xmm1 411; AVX2-FP-NEXT: vmovdqa 32(%rdi), %xmm2 412; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[4,5,14,15,u,u,u,u,u,u,u,u,u,u,u,u] 413; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[0,1,10,11,u,u,u,u,u,u,u,u,u,u,u,u] 414; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] 415; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1,2,3,4,5,6,7] 416; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3] 417; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,3,12,13,6,7,0,1,u,u,u,u,u,u,u,u] 418; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0],xmm0[1,2,3] 419; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm1[2],xmm5[3] 420; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u] 421; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] 422; AVX2-FP-NEXT: 
vpblendd {{.*#+}} xmm6 = xmm1[0],xmm6[1],xmm1[2,3] 423; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u] 424; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3] 425; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u] 426; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7] 427; AVX2-FP-NEXT: vmovq %xmm3, (%rsi) 428; AVX2-FP-NEXT: vmovq %xmm4, (%rdx) 429; AVX2-FP-NEXT: vmovq %xmm5, (%rcx) 430; AVX2-FP-NEXT: vmovq %xmm6, (%r8) 431; AVX2-FP-NEXT: vmovq %xmm0, (%r9) 432; AVX2-FP-NEXT: retq 433; 434; AVX2-FCP-LABEL: load_i16_stride5_vf4: 435; AVX2-FCP: # %bb.0: 436; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0 437; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 438; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm2 439; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm1[4,5,14,15,u,u,u,u,u,u,u,u,u,u,u,u] 440; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[0,1,10,11,u,u,u,u,u,u,u,u,u,u,u,u] 441; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] 442; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0],xmm0[1,2,3,4,5,6,7] 443; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3] 444; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,3,12,13,6,7,0,1,u,u,u,u,u,u,u,u] 445; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0],xmm0[1,2,3] 446; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm1[2],xmm5[3] 447; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u] 448; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] 449; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm1[0],xmm6[1],xmm1[2,3] 450; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u] 451; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3] 452; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u] 453; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7] 454; AVX2-FCP-NEXT: vmovq %xmm3, (%rsi) 455; AVX2-FCP-NEXT: vmovq %xmm4, (%rdx) 456; AVX2-FCP-NEXT: vmovq %xmm5, (%rcx) 457; AVX2-FCP-NEXT: vmovq %xmm6, (%r8) 458; AVX2-FCP-NEXT: vmovq %xmm0, (%r9) 459; AVX2-FCP-NEXT: retq 460; 461; AVX512-LABEL: load_i16_stride5_vf4: 462; AVX512: # %bb.0: 463; AVX512-NEXT: vmovdqa (%rdi), %xmm0 464; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1 465; AVX512-NEXT: vmovdqa 32(%rdi), %xmm2 466; AVX512-NEXT: vpextrw $5, %xmm0, %eax 467; AVX512-NEXT: vpinsrw $1, %eax, %xmm0, %xmm3 468; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3,4,5,6,7] 469; AVX512-NEXT: vpextrw $7, %xmm1, %eax 470; AVX512-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3 471; AVX512-NEXT: vpextrw $6, %xmm0, %eax 472; AVX512-NEXT: vpextrw $1, %xmm0, %r10d 473; AVX512-NEXT: vmovd %r10d, %xmm4 474; AVX512-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4 475; AVX512-NEXT: vpextrw $3, %xmm1, %eax 476; AVX512-NEXT: vpinsrw $2, %eax, %xmm4, %xmm1 477; AVX512-NEXT: vmovd %xmm2, %eax 478; AVX512-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 479; AVX512-NEXT: vmovdqa 16(%rdi), %xmm4 480; AVX512-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0],xmm0[1,2,3] 481; AVX512-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm4[2],xmm5[3] 482; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u] 483; AVX512-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] 484; AVX512-NEXT: vpblendd {{.*#+}} xmm6 = xmm4[0],xmm6[1],xmm4[2,3] 485; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u] 486; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2],xmm4[3] 487; AVX512-NEXT: vpshufb 
{{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u] 488; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7] 489; AVX512-NEXT: vmovq %xmm3, (%rsi) 490; AVX512-NEXT: vmovq %xmm1, (%rdx) 491; AVX512-NEXT: vmovq %xmm5, (%rcx) 492; AVX512-NEXT: vmovq %xmm6, (%r8) 493; AVX512-NEXT: vmovq %xmm0, (%r9) 494; AVX512-NEXT: retq 495; 496; AVX512-FCP-LABEL: load_i16_stride5_vf4: 497; AVX512-FCP: # %bb.0: 498; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0 499; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 500; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm2 501; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,1,10,11,u,u,u,u,u,u,u,u,u,u,u,u] 502; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3,4,5,6,7] 503; AVX512-FCP-NEXT: vpextrw $7, %xmm1, %eax 504; AVX512-FCP-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3 505; AVX512-FCP-NEXT: vpextrw $3, %xmm1, %eax 506; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,3,12,13,u,u,u,u,u,u,u,u,u,u,u,u] 507; AVX512-FCP-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 508; AVX512-FCP-NEXT: vmovd %xmm2, %eax 509; AVX512-FCP-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 510; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm4 511; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0],xmm0[1,2,3] 512; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm4[2],xmm5[3] 513; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u] 514; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] 515; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm4[0],xmm6[1],xmm4[2,3] 516; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u] 517; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2],xmm4[3] 518; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u] 519; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7] 520; AVX512-FCP-NEXT: vmovq %xmm3, (%rsi) 521; AVX512-FCP-NEXT: vmovq %xmm1, (%rdx) 522; AVX512-FCP-NEXT: vmovq %xmm5, (%rcx) 523; AVX512-FCP-NEXT: vmovq %xmm6, (%r8) 524; AVX512-FCP-NEXT: vmovq %xmm0, (%r9) 525; AVX512-FCP-NEXT: retq 526; 527; AVX512DQ-LABEL: load_i16_stride5_vf4: 528; AVX512DQ: # %bb.0: 529; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 530; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm1 531; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm2 532; AVX512DQ-NEXT: vpextrw $5, %xmm0, %eax 533; AVX512DQ-NEXT: vpinsrw $1, %eax, %xmm0, %xmm3 534; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3,4,5,6,7] 535; AVX512DQ-NEXT: vpextrw $7, %xmm1, %eax 536; AVX512DQ-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3 537; AVX512DQ-NEXT: vpextrw $6, %xmm0, %eax 538; AVX512DQ-NEXT: vpextrw $1, %xmm0, %r10d 539; AVX512DQ-NEXT: vmovd %r10d, %xmm4 540; AVX512DQ-NEXT: vpinsrw $1, %eax, %xmm4, %xmm4 541; AVX512DQ-NEXT: vpextrw $3, %xmm1, %eax 542; AVX512DQ-NEXT: vpinsrw $2, %eax, %xmm4, %xmm1 543; AVX512DQ-NEXT: vmovd %xmm2, %eax 544; AVX512DQ-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 545; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm4 546; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0],xmm0[1,2,3] 547; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm4[2],xmm5[3] 548; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u] 549; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] 550; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm6 = xmm4[0],xmm6[1],xmm4[2,3] 551; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u] 552; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2],xmm4[3] 553; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = 
xmm0[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u] 554; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7] 555; AVX512DQ-NEXT: vmovq %xmm3, (%rsi) 556; AVX512DQ-NEXT: vmovq %xmm1, (%rdx) 557; AVX512DQ-NEXT: vmovq %xmm5, (%rcx) 558; AVX512DQ-NEXT: vmovq %xmm6, (%r8) 559; AVX512DQ-NEXT: vmovq %xmm0, (%r9) 560; AVX512DQ-NEXT: retq 561; 562; AVX512DQ-FCP-LABEL: load_i16_stride5_vf4: 563; AVX512DQ-FCP: # %bb.0: 564; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0 565; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 566; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm2 567; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,1,10,11,u,u,u,u,u,u,u,u,u,u,u,u] 568; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3,4,5,6,7] 569; AVX512DQ-FCP-NEXT: vpextrw $7, %xmm1, %eax 570; AVX512DQ-FCP-NEXT: vpinsrw $3, %eax, %xmm3, %xmm3 571; AVX512DQ-FCP-NEXT: vpextrw $3, %xmm1, %eax 572; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,3,12,13,u,u,u,u,u,u,u,u,u,u,u,u] 573; AVX512DQ-FCP-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 574; AVX512DQ-FCP-NEXT: vmovd %xmm2, %eax 575; AVX512DQ-FCP-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 576; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm4 577; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0],xmm0[1,2,3] 578; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm4[2],xmm5[3] 579; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,u,u,u,u,u,u,u,u] 580; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0,1],xmm2[2],xmm0[3,4,5,6,7] 581; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm4[0],xmm6[1],xmm4[2,3] 582; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,u,u,u,u,u,u,u,u] 583; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm0[2],xmm4[3] 584; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u] 585; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5,6,7] 586; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rsi) 587; AVX512DQ-FCP-NEXT: vmovq %xmm1, (%rdx) 588; AVX512DQ-FCP-NEXT: vmovq %xmm5, (%rcx) 589; AVX512DQ-FCP-NEXT: vmovq %xmm6, (%r8) 590; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%r9) 591; AVX512DQ-FCP-NEXT: retq 592; 593; AVX512BW-LABEL: load_i16_stride5_vf4: 594; AVX512BW: # %bb.0: 595; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [1,6,11,0,0,0,0,0] 596; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1 597; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 598; AVX512BW-NEXT: vmovq {{.*#+}} xmm2 = [0,5,10,0,0,0,0,0] 599; AVX512BW-NEXT: vpermw %zmm1, %zmm2, %zmm2 600; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm3 601; AVX512BW-NEXT: vpextrw $7, %xmm3, %eax 602; AVX512BW-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2 603; AVX512BW-NEXT: vpinsrw $3, 32(%rdi), %xmm0, %xmm0 604; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = [2,7,12,17,0,0,0,0] 605; AVX512BW-NEXT: vpermw %zmm1, %zmm3, %zmm3 606; AVX512BW-NEXT: vmovq {{.*#+}} xmm4 = [3,8,13,18,0,0,0,0] 607; AVX512BW-NEXT: vpermw %zmm1, %zmm4, %zmm4 608; AVX512BW-NEXT: vmovq {{.*#+}} xmm5 = [4,9,14,19,0,0,0,0] 609; AVX512BW-NEXT: vpermw %zmm1, %zmm5, %zmm1 610; AVX512BW-NEXT: vmovq %xmm2, (%rsi) 611; AVX512BW-NEXT: vmovq %xmm0, (%rdx) 612; AVX512BW-NEXT: vmovq %xmm3, (%rcx) 613; AVX512BW-NEXT: vmovq %xmm4, (%r8) 614; AVX512BW-NEXT: vmovq %xmm1, (%r9) 615; AVX512BW-NEXT: vzeroupper 616; AVX512BW-NEXT: retq 617; 618; AVX512BW-FCP-LABEL: load_i16_stride5_vf4: 619; AVX512BW-FCP: # %bb.0: 620; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [1,6,11,0,0,0,0,0] 621; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 622; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 623; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = 
[0,5,10,0,0,0,0,0] 624; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm2 625; AVX512BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm3 626; AVX512BW-FCP-NEXT: vpextrw $7, %xmm3, %eax 627; AVX512BW-FCP-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2 628; AVX512BW-FCP-NEXT: vpinsrw $3, 32(%rdi), %xmm0, %xmm0 629; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [2,7,12,17,0,0,0,0] 630; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm3, %zmm3 631; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [3,8,13,18,0,0,0,0] 632; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm4, %zmm4 633; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [4,9,14,19,0,0,0,0] 634; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm5, %zmm1 635; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rsi) 636; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rdx) 637; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rcx) 638; AVX512BW-FCP-NEXT: vmovq %xmm4, (%r8) 639; AVX512BW-FCP-NEXT: vmovq %xmm1, (%r9) 640; AVX512BW-FCP-NEXT: vzeroupper 641; AVX512BW-FCP-NEXT: retq 642; 643; AVX512DQ-BW-LABEL: load_i16_stride5_vf4: 644; AVX512DQ-BW: # %bb.0: 645; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [1,6,11,0,0,0,0,0] 646; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm1 647; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 648; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm2 = [0,5,10,0,0,0,0,0] 649; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm2, %zmm2 650; AVX512DQ-BW-NEXT: vmovdqa 16(%rdi), %xmm3 651; AVX512DQ-BW-NEXT: vpextrw $7, %xmm3, %eax 652; AVX512DQ-BW-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2 653; AVX512DQ-BW-NEXT: vpinsrw $3, 32(%rdi), %xmm0, %xmm0 654; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm3 = [2,7,12,17,0,0,0,0] 655; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm3, %zmm3 656; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm4 = [3,8,13,18,0,0,0,0] 657; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm4, %zmm4 658; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm5 = [4,9,14,19,0,0,0,0] 659; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm5, %zmm1 660; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rsi) 661; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rdx) 662; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rcx) 663; AVX512DQ-BW-NEXT: vmovq %xmm4, (%r8) 664; AVX512DQ-BW-NEXT: vmovq %xmm1, (%r9) 665; AVX512DQ-BW-NEXT: vzeroupper 666; AVX512DQ-BW-NEXT: retq 667; 668; AVX512DQ-BW-FCP-LABEL: load_i16_stride5_vf4: 669; AVX512DQ-BW-FCP: # %bb.0: 670; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [1,6,11,0,0,0,0,0] 671; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 672; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0 673; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = [0,5,10,0,0,0,0,0] 674; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm2 675; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm3 676; AVX512DQ-BW-FCP-NEXT: vpextrw $7, %xmm3, %eax 677; AVX512DQ-BW-FCP-NEXT: vpinsrw $3, %eax, %xmm2, %xmm2 678; AVX512DQ-BW-FCP-NEXT: vpinsrw $3, 32(%rdi), %xmm0, %xmm0 679; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [2,7,12,17,0,0,0,0] 680; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm3, %zmm3 681; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [3,8,13,18,0,0,0,0] 682; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm4, %zmm4 683; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [4,9,14,19,0,0,0,0] 684; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm5, %zmm1 685; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rsi) 686; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rdx) 687; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rcx) 688; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%r8) 689; AVX512DQ-BW-FCP-NEXT: vmovq %xmm1, (%r9) 690; AVX512DQ-BW-FCP-NEXT: vzeroupper 691; AVX512DQ-BW-FCP-NEXT: retq 692 %wide.vec = load <20 x i16>, ptr %in.vec, align 64 693 %strided.vec0 = shufflevector <20 x i16> %wide.vec, <20 x i16> poison, <4 x i32> <i32 0, i32 5, i32 10, i32 15> 694 
%strided.vec1 = shufflevector <20 x i16> %wide.vec, <20 x i16> poison, <4 x i32> <i32 1, i32 6, i32 11, i32 16> 695 %strided.vec2 = shufflevector <20 x i16> %wide.vec, <20 x i16> poison, <4 x i32> <i32 2, i32 7, i32 12, i32 17> 696 %strided.vec3 = shufflevector <20 x i16> %wide.vec, <20 x i16> poison, <4 x i32> <i32 3, i32 8, i32 13, i32 18> 697 %strided.vec4 = shufflevector <20 x i16> %wide.vec, <20 x i16> poison, <4 x i32> <i32 4, i32 9, i32 14, i32 19> 698 store <4 x i16> %strided.vec0, ptr %out.vec0, align 64 699 store <4 x i16> %strided.vec1, ptr %out.vec1, align 64 700 store <4 x i16> %strided.vec2, ptr %out.vec2, align 64 701 store <4 x i16> %strided.vec3, ptr %out.vec3, align 64 702 store <4 x i16> %strided.vec4, ptr %out.vec4, align 64 703 ret void 704} 705 706define void @load_i16_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4) nounwind { 707; SSE-LABEL: load_i16_stride5_vf8: 708; SSE: # %bb.0: 709; SSE-NEXT: movdqa 64(%rdi), %xmm6 710; SSE-NEXT: movdqa (%rdi), %xmm4 711; SSE-NEXT: movdqa 16(%rdi), %xmm3 712; SSE-NEXT: movdqa 32(%rdi), %xmm0 713; SSE-NEXT: movdqa 48(%rdi), %xmm5 714; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,0,65535,65535,65535] 715; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,1,0,3] 716; SSE-NEXT: pand %xmm1, %xmm2 717; SSE-NEXT: pandn %xmm0, %xmm1 718; SSE-NEXT: por %xmm2, %xmm1 719; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[3,1,2,3] 720; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] 721; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,2,2,3] 722; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,3,2,3,4,5,6,7] 723; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] 724; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[2,3] 725; SSE-NEXT: movaps {{.*#+}} xmm1 = [65535,65535,65535,65535,65535,65535,65535,0] 726; SSE-NEXT: andps %xmm1, %xmm7 727; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm6[0,1,0,1] 728; SSE-NEXT: movaps %xmm1, %xmm2 729; SSE-NEXT: pandn %xmm8, %xmm2 730; SSE-NEXT: por %xmm7, %xmm2 731; SSE-NEXT: movdqa %xmm3, %xmm7 732; SSE-NEXT: psrlq $48, %xmm7 733; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[0,3,2,3] 734; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[1,2,2,3,4,5,6,7] 735; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1] 736; SSE-NEXT: movdqa {{.*#+}} xmm7 = [0,0,0,65535,65535,65535,65535,65535] 737; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm5[1,3,2,3] 738; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,2,2,3] 739; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1] 740; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm10[0,1,2,3,7,5,6,7] 741; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,1,2,1] 742; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[0,0,0,0,4,5,6,7] 743; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,6,4,7] 744; SSE-NEXT: pand %xmm7, %xmm9 745; SSE-NEXT: pandn %xmm8, %xmm7 746; SSE-NEXT: por %xmm9, %xmm7 747; SSE-NEXT: pand %xmm1, %xmm7 748; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm6[0,1,2,0] 749; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm6[0,1,0,3] 750; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm6[0,1,1,3] 751; SSE-NEXT: psllq $48, %xmm6 752; SSE-NEXT: pandn %xmm6, %xmm1 753; SSE-NEXT: por %xmm7, %xmm1 754; SSE-NEXT: movdqa %xmm5, %xmm7 755; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,2,2,3] 756; SSE-NEXT: movdqa %xmm5, %xmm12 757; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[1,0],xmm0[0,0] 758; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm0[2,3] 759; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,0,1,3] 760; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,65535,65535,0,0,0,65535,65535] 761; SSE-NEXT: pshufd 
{{.*#+}} xmm5 = xmm4[0,1,1,3] 762; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,7] 763; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm3[2],xmm5[3],xmm3[3] 764; SSE-NEXT: pand %xmm13, %xmm5 765; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm4[1,1,1,1] 766; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm3[0,2,2,3] 767; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm11[0],xmm14[1],xmm11[1] 768; SSE-NEXT: movdqa %xmm13, %xmm15 769; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm14[0,3,2,3,4,5,6,7] 770; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,2,2,3] 771; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[1,0,3,3,4,5,6,7] 772; SSE-NEXT: pand %xmm13, %xmm11 773; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm3[3,0] 774; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0,2] 775; SSE-NEXT: movdqa %xmm13, %xmm4 776; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7] 777; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] 778; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[2,1,3,3,4,5,6,7] 779; SSE-NEXT: pand %xmm13, %xmm3 780; SSE-NEXT: pandn %xmm12, %xmm13 781; SSE-NEXT: por %xmm13, %xmm5 782; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,6,5,6,7] 783; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,6,5] 784; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[3,1],xmm12[2,3] 785; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm8[2,0] 786; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm0[2,2,2,2,4,5,6,7] 787; SSE-NEXT: pandn %xmm8, %xmm15 788; SSE-NEXT: por %xmm15, %xmm11 789; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm0[3,0] 790; SSE-NEXT: pandn %xmm0, %xmm4 791; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm7[0,2] 792; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,4,6,7] 793; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm9[0,1,2,3,4,5,5,6] 794; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[3,1],xmm0[2,3] 795; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm7[2,0] 796; SSE-NEXT: por %xmm4, %xmm3 797; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[0,1,0,3,4,5,6,7] 798; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm10[0,1,2,3,4,5,4,7] 799; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm0[1,3] 800; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,0] 801; SSE-NEXT: movdqa %xmm2, (%rsi) 802; SSE-NEXT: movdqa %xmm1, (%rdx) 803; SSE-NEXT: movaps %xmm5, (%rcx) 804; SSE-NEXT: movaps %xmm11, (%r8) 805; SSE-NEXT: movaps %xmm3, (%r9) 806; SSE-NEXT: retq 807; 808; AVX-LABEL: load_i16_stride5_vf8: 809; AVX: # %bb.0: 810; AVX-NEXT: vmovdqa (%rdi), %xmm0 811; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 812; AVX-NEXT: vmovdqa 32(%rdi), %xmm2 813; AVX-NEXT: vmovdqa 48(%rdi), %xmm3 814; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[0,1,0,3] 815; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm2[4],xmm4[5,6,7] 816; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[3,1,2,3] 817; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[2,1,2,3,4,5,6,7] 818; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[0,2,2,3] 819; AVX-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,3,2,3,4,5,6,7] 820; AVX-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] 821; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7] 822; AVX-NEXT: vmovdqa 64(%rdi), %xmm5 823; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[0,1,0,1] 824; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6],xmm6[7] 825; AVX-NEXT: vpsrlq $48, %xmm1, %xmm6 826; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[0,3,2,3] 827; AVX-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,2,2,3,4,5,6,7] 828; AVX-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] 829; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7] 830; AVX-NEXT: vpshufb {{.*#+}} xmm7 = 
xmm7[u,u,u,u,u,u,0,1,10,11,4,5,14,15,u,u] 831; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm7[3,4,5,6,7] 832; AVX-NEXT: vpsllq $48, %xmm5, %xmm7 833; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,6],xmm7[7] 834; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[0,1,1,3] 835; AVX-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,7,6,7] 836; AVX-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm7[2],xmm1[2],xmm7[3],xmm1[3] 837; AVX-NEXT: vpblendw {{.*#+}} xmm8 = xmm2[0,1],xmm3[2,3],xmm2[4,5,6,7] 838; AVX-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,2,3,12,13,6,7,u,u,u,u] 839; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm8[3,4,5],xmm7[6,7] 840; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm5[0,1,2,0] 841; AVX-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,6,5] 842; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5],xmm8[6,7] 843; AVX-NEXT: vpblendw {{.*#+}} xmm8 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7] 844; AVX-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[6,7,0,1,10,11,u,u,u,u,u,u,12,13,14,15] 845; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm2[0,1,2,3],xmm3[4,5],xmm2[6,7] 846; AVX-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[2,2,2,2,4,5,6,7] 847; AVX-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,7,4,6,7] 848; AVX-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm9[3,4,5],xmm8[6,7] 849; AVX-NEXT: vpshufd {{.*#+}} xmm9 = xmm5[0,1,0,3] 850; AVX-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,5,6] 851; AVX-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5],xmm9[6,7] 852; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,1,1] 853; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] 854; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,0,3,4,5,6,7] 855; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] 856; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] 857; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,u,u,u,u,u,u,12,13,14,15] 858; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3,4,5],xmm0[6,7] 859; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[0,1,1,3] 860; AVX-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] 861; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7] 862; AVX-NEXT: vmovdqa %xmm4, (%rsi) 863; AVX-NEXT: vmovdqa %xmm6, (%rdx) 864; AVX-NEXT: vmovdqa %xmm7, (%rcx) 865; AVX-NEXT: vmovdqa %xmm8, (%r8) 866; AVX-NEXT: vmovdqa %xmm0, (%r9) 867; AVX-NEXT: retq 868; 869; AVX2-LABEL: load_i16_stride5_vf8: 870; AVX2: # %bb.0: 871; AVX2-NEXT: vmovdqa (%rdi), %ymm0 872; AVX2-NEXT: vmovdqa 32(%rdi), %ymm2 873; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5],ymm2[6],ymm0[7,8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13],ymm2[14],ymm0[15] 874; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3 875; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4,5],xmm3[6,7] 876; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,10,11,4,5,14,15,8,9,2,3,12,13,u,u] 877; AVX2-NEXT: vpbroadcastw 70(%rdi), %xmm3 878; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm3[7] 879; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0],ymm0[1],ymm2[2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7,8],ymm0[9],ymm2[10],ymm0[11],ymm2[12,13],ymm0[14],ymm2[15] 880; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4 881; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5,6],xmm4[7] 882; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,3,12,13,6,7,0,1,10,11,4,5,14,15,u,u] 883; AVX2-NEXT: vmovdqa 64(%rdi), %xmm4 884; AVX2-NEXT: vpsllq $48, %xmm4, %xmm5 885; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6],xmm5[7] 886; AVX2-NEXT: vpblendw {{.*#+}} ymm5 = 
ymm2[0,1],ymm0[2],ymm2[3],ymm0[4],ymm2[5,6],ymm0[7],ymm2[8,9],ymm0[10],ymm2[11],ymm0[12],ymm2[13,14],ymm0[15] 887; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6 888; AVX2-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3,4],xmm5[5,6,7] 889; AVX2-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u] 890; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[0,1,2,0] 891; AVX2-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,6,5] 892; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3] 893; AVX2-NEXT: vpblendw {{.*#+}} ymm6 = ymm0[0],ymm2[1,2],ymm0[3],ymm2[4],ymm0[5],ymm2[6,7],ymm0[8],ymm2[9,10],ymm0[11],ymm2[12],ymm0[13],ymm2[14,15] 894; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm7 895; AVX2-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2],xmm6[3] 896; AVX2-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u] 897; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[0,1,0,3] 898; AVX2-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,5,6] 899; AVX2-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1,2],xmm7[3] 900; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5],ymm0[6],ymm2[7,8],ymm0[9],ymm2[10,11],ymm0[12],ymm2[13],ymm0[14],ymm2[15] 901; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 902; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3,4],xmm2[5,6,7] 903; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,6,7,0,1,10,11,u,u,u,u] 904; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[0,1,1,3] 905; AVX2-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] 906; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3] 907; AVX2-NEXT: vmovdqa %xmm1, (%rsi) 908; AVX2-NEXT: vmovdqa %xmm3, (%rdx) 909; AVX2-NEXT: vmovdqa %xmm5, (%rcx) 910; AVX2-NEXT: vmovdqa %xmm6, (%r8) 911; AVX2-NEXT: vmovdqa %xmm0, (%r9) 912; AVX2-NEXT: vzeroupper 913; AVX2-NEXT: retq 914; 915; AVX2-FP-LABEL: load_i16_stride5_vf8: 916; AVX2-FP: # %bb.0: 917; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm0 918; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm1 919; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] 920; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3 921; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4,5],xmm3[6,7] 922; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,10,11,4,5,14,15,8,9,2,3,12,13,u,u] 923; AVX2-FP-NEXT: vpbroadcastw 70(%rdi), %xmm3 924; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6],xmm3[7] 925; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] 926; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm4 927; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5,6],xmm4[7] 928; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,3,12,13,6,7,0,1,10,11,4,5,14,15,u,u] 929; AVX2-FP-NEXT: vmovdqa 64(%rdi), %xmm4 930; AVX2-FP-NEXT: vpsllq $48, %xmm4, %xmm5 931; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6],xmm5[7] 932; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,14,15,8,9,2,3,12,13,6,7,0,1,10,11] 933; AVX2-FP-NEXT: vpshufb %xmm5, %xmm4, %xmm6 934; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15] 935; AVX2-FP-NEXT: vextracti128 $1, %ymm7, %xmm8 936; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm8[3,4],xmm7[5,6,7] 937; AVX2-FP-NEXT: vpshufb %xmm5, %xmm7, %xmm5 938; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3] 939; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm6 = 
[6,7,0,1,10,11,4,5,14,15,8,9,2,3,12,13] 940; AVX2-FP-NEXT: vpshufb %xmm6, %xmm4, %xmm7 941; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm8 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15] 942; AVX2-FP-NEXT: vextracti128 $1, %ymm8, %xmm9 943; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2],xmm8[3] 944; AVX2-FP-NEXT: vpshufb %xmm6, %xmm8, %xmm6 945; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1,2],xmm7[3] 946; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,2,3,12,13,6,7,0,1,10,11,4,5,14,15] 947; AVX2-FP-NEXT: vpshufb %xmm7, %xmm4, %xmm4 948; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] 949; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm1 950; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7] 951; AVX2-FP-NEXT: vpshufb %xmm7, %xmm0, %xmm0 952; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[3] 953; AVX2-FP-NEXT: vmovdqa %xmm2, (%rsi) 954; AVX2-FP-NEXT: vmovdqa %xmm3, (%rdx) 955; AVX2-FP-NEXT: vmovdqa %xmm5, (%rcx) 956; AVX2-FP-NEXT: vmovdqa %xmm6, (%r8) 957; AVX2-FP-NEXT: vmovdqa %xmm0, (%r9) 958; AVX2-FP-NEXT: vzeroupper 959; AVX2-FP-NEXT: retq 960; 961; AVX2-FCP-LABEL: load_i16_stride5_vf8: 962; AVX2-FCP: # %bb.0: 963; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm0 964; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 965; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] 966; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 967; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4,5],xmm3[6,7] 968; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,10,11,4,5,14,15,8,9,2,3,12,13,u,u] 969; AVX2-FCP-NEXT: vpbroadcastw 70(%rdi), %xmm3 970; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6],xmm3[7] 971; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] 972; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4 973; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5,6],xmm4[7] 974; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[2,3,12,13,6,7,0,1,10,11,4,5,14,15,u,u] 975; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %xmm4 976; AVX2-FCP-NEXT: vpsllq $48, %xmm4, %xmm5 977; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6],xmm5[7] 978; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,14,15,8,9,2,3,12,13,6,7,0,1,10,11] 979; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm4, %xmm6 980; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15] 981; AVX2-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8 982; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm8[3,4],xmm7[5,6,7] 983; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm7, %xmm5 984; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3] 985; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [6,7,0,1,10,11,4,5,14,15,8,9,2,3,12,13] 986; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm4, %xmm7 987; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15] 988; AVX2-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9 989; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2],xmm8[3] 990; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm8, %xmm6 991; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1,2],xmm7[3] 992; AVX2-FCP-NEXT: 
vmovdqa {{.*#+}} xmm7 = [8,9,2,3,12,13,6,7,0,1,10,11,4,5,14,15] 993; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm4, %xmm4 994; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] 995; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 996; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7] 997; AVX2-FCP-NEXT: vpshufb %xmm7, %xmm0, %xmm0 998; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[3] 999; AVX2-FCP-NEXT: vmovdqa %xmm2, (%rsi) 1000; AVX2-FCP-NEXT: vmovdqa %xmm3, (%rdx) 1001; AVX2-FCP-NEXT: vmovdqa %xmm5, (%rcx) 1002; AVX2-FCP-NEXT: vmovdqa %xmm6, (%r8) 1003; AVX2-FCP-NEXT: vmovdqa %xmm0, (%r9) 1004; AVX2-FCP-NEXT: vzeroupper 1005; AVX2-FCP-NEXT: retq 1006; 1007; AVX512-LABEL: load_i16_stride5_vf8: 1008; AVX512: # %bb.0: 1009; AVX512-NEXT: vmovdqa (%rdi), %ymm0 1010; AVX512-NEXT: vmovdqa 32(%rdi), %ymm2 1011; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5],ymm2[6],ymm0[7,8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13],ymm2[14],ymm0[15] 1012; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm3 1013; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4,5],xmm3[6,7] 1014; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,10,11,4,5,14,15,8,9,2,3,12,13,u,u] 1015; AVX512-NEXT: vpbroadcastw 70(%rdi), %xmm3 1016; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm3[7] 1017; AVX512-NEXT: vmovdqa 64(%rdi), %xmm3 1018; AVX512-NEXT: vpblendw {{.*#+}} ymm4 = ymm2[0],ymm0[1],ymm2[2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7,8],ymm0[9],ymm2[10],ymm0[11],ymm2[12,13],ymm0[14],ymm2[15] 1019; AVX512-NEXT: vextracti128 $1, %ymm4, %xmm5 1020; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5,6],xmm5[7] 1021; AVX512-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,3,12,13,6,7,0,1,10,11,4,5,14,15,u,u] 1022; AVX512-NEXT: vpsllq $48, %xmm3, %xmm5 1023; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6],xmm5[7] 1024; AVX512-NEXT: vpblendw {{.*#+}} ymm5 = ymm2[0,1],ymm0[2],ymm2[3],ymm0[4],ymm2[5,6],ymm0[7],ymm2[8,9],ymm0[10],ymm2[11],ymm0[12],ymm2[13,14],ymm0[15] 1025; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm6 1026; AVX512-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3,4],xmm5[5,6,7] 1027; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u] 1028; AVX512-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[0,1,2,0] 1029; AVX512-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,6,5] 1030; AVX512-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3] 1031; AVX512-NEXT: vpblendw {{.*#+}} ymm6 = ymm0[0],ymm2[1,2],ymm0[3],ymm2[4],ymm0[5],ymm2[6,7],ymm0[8],ymm2[9,10],ymm0[11],ymm2[12],ymm0[13],ymm2[14,15] 1032; AVX512-NEXT: vextracti128 $1, %ymm6, %xmm7 1033; AVX512-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2],xmm6[3] 1034; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u] 1035; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm3[0,1,0,3] 1036; AVX512-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,5,6] 1037; AVX512-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1,2],xmm7[3] 1038; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5],ymm0[6],ymm2[7,8],ymm0[9],ymm2[10,11],ymm0[12],ymm2[13],ymm0[14],ymm2[15] 1039; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm2 1040; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3,4],xmm2[5,6,7] 1041; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,6,7,0,1,10,11,u,u,u,u] 1042; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,1,1,3] 1043; AVX512-NEXT: vpshufhw {{.*#+}} xmm2 
= xmm2[0,1,2,3,4,5,4,7] 1044; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3] 1045; AVX512-NEXT: vmovdqa %xmm1, (%rsi) 1046; AVX512-NEXT: vmovdqa %xmm4, (%rdx) 1047; AVX512-NEXT: vmovdqa %xmm5, (%rcx) 1048; AVX512-NEXT: vmovdqa %xmm6, (%r8) 1049; AVX512-NEXT: vmovdqa %xmm0, (%r9) 1050; AVX512-NEXT: vzeroupper 1051; AVX512-NEXT: retq 1052; 1053; AVX512-FCP-LABEL: load_i16_stride5_vf8: 1054; AVX512-FCP: # %bb.0: 1055; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm0 1056; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 1057; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] 1058; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 1059; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4,5],xmm3[6,7] 1060; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,10,11,4,5,14,15,8,9,2,3,12,13,u,u] 1061; AVX512-FCP-NEXT: vpbroadcastw 70(%rdi), %xmm3 1062; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6],xmm3[7] 1063; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm3 1064; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] 1065; AVX512-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5 1066; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5,6],xmm5[7] 1067; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,3,12,13,6,7,0,1,10,11,4,5,14,15,u,u] 1068; AVX512-FCP-NEXT: vpsllq $48, %xmm3, %xmm5 1069; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6],xmm5[7] 1070; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,14,15,8,9,2,3,12,13,6,7,0,1,10,11] 1071; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm3, %xmm6 1072; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15] 1073; AVX512-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8 1074; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm8[3,4],xmm7[5,6,7] 1075; AVX512-FCP-NEXT: vpshufb %xmm5, %xmm7, %xmm5 1076; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3] 1077; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [6,7,0,1,10,11,4,5,14,15,8,9,2,3,12,13] 1078; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm3, %xmm7 1079; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15] 1080; AVX512-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9 1081; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2],xmm8[3] 1082; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm8, %xmm6 1083; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1,2],xmm7[3] 1084; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,2,3,12,13,6,7,0,1,10,11,4,5,14,15] 1085; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm3, %xmm3 1086; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] 1087; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 1088; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7] 1089; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm0, %xmm0 1090; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3] 1091; AVX512-FCP-NEXT: vmovdqa %xmm2, (%rsi) 1092; AVX512-FCP-NEXT: vmovdqa %xmm4, (%rdx) 1093; AVX512-FCP-NEXT: vmovdqa %xmm5, (%rcx) 1094; AVX512-FCP-NEXT: vmovdqa %xmm6, (%r8) 1095; AVX512-FCP-NEXT: vmovdqa %xmm0, (%r9) 1096; AVX512-FCP-NEXT: vzeroupper 1097; AVX512-FCP-NEXT: retq 1098; 
1099; AVX512DQ-LABEL: load_i16_stride5_vf8: 1100; AVX512DQ: # %bb.0: 1101; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 1102; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm2 1103; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5],ymm2[6],ymm0[7,8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13],ymm2[14],ymm0[15] 1104; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm3 1105; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4,5],xmm3[6,7] 1106; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,10,11,4,5,14,15,8,9,2,3,12,13,u,u] 1107; AVX512DQ-NEXT: vpbroadcastw 70(%rdi), %xmm3 1108; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm3[7] 1109; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm3 1110; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = ymm2[0],ymm0[1],ymm2[2],ymm0[3],ymm2[4,5],ymm0[6],ymm2[7,8],ymm0[9],ymm2[10],ymm0[11],ymm2[12,13],ymm0[14],ymm2[15] 1111; AVX512DQ-NEXT: vextracti128 $1, %ymm4, %xmm5 1112; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5,6],xmm5[7] 1113; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,3,12,13,6,7,0,1,10,11,4,5,14,15,u,u] 1114; AVX512DQ-NEXT: vpsllq $48, %xmm3, %xmm5 1115; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6],xmm5[7] 1116; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm5 = ymm2[0,1],ymm0[2],ymm2[3],ymm0[4],ymm2[5,6],ymm0[7],ymm2[8,9],ymm0[10],ymm2[11],ymm0[12],ymm2[13,14],ymm0[15] 1117; AVX512DQ-NEXT: vextracti128 $1, %ymm5, %xmm6 1118; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3,4],xmm5[5,6,7] 1119; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u] 1120; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[0,1,2,0] 1121; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,5,6,5] 1122; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3] 1123; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm6 = ymm0[0],ymm2[1,2],ymm0[3],ymm2[4],ymm0[5],ymm2[6,7],ymm0[8],ymm2[9,10],ymm0[11],ymm2[12],ymm0[13],ymm2[14,15] 1124; AVX512DQ-NEXT: vextracti128 $1, %ymm6, %xmm7 1125; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2],xmm6[3] 1126; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u] 1127; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm3[0,1,0,3] 1128; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,5,6] 1129; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1,2],xmm7[3] 1130; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1],ymm2[2,3],ymm0[4],ymm2[5],ymm0[6],ymm2[7,8],ymm0[9],ymm2[10,11],ymm0[12],ymm2[13],ymm0[14],ymm2[15] 1131; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm2 1132; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3,4],xmm2[5,6,7] 1133; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,2,3,12,13,6,7,0,1,10,11,u,u,u,u] 1134; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,1,1,3] 1135; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] 1136; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3] 1137; AVX512DQ-NEXT: vmovdqa %xmm1, (%rsi) 1138; AVX512DQ-NEXT: vmovdqa %xmm4, (%rdx) 1139; AVX512DQ-NEXT: vmovdqa %xmm5, (%rcx) 1140; AVX512DQ-NEXT: vmovdqa %xmm6, (%r8) 1141; AVX512DQ-NEXT: vmovdqa %xmm0, (%r9) 1142; AVX512DQ-NEXT: vzeroupper 1143; AVX512DQ-NEXT: retq 1144; 1145; AVX512DQ-FCP-LABEL: load_i16_stride5_vf8: 1146; AVX512DQ-FCP: # %bb.0: 1147; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm0 1148; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 1149; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] 1150; 
AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 1151; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4,5],xmm3[6,7] 1152; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,10,11,4,5,14,15,8,9,2,3,12,13,u,u] 1153; AVX512DQ-FCP-NEXT: vpbroadcastw 70(%rdi), %xmm3 1154; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6],xmm3[7] 1155; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm3 1156; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] 1157; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5 1158; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5,6],xmm5[7] 1159; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[2,3,12,13,6,7,0,1,10,11,4,5,14,15,u,u] 1160; AVX512DQ-FCP-NEXT: vpsllq $48, %xmm3, %xmm5 1161; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6],xmm5[7] 1162; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [4,5,14,15,8,9,2,3,12,13,6,7,0,1,10,11] 1163; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm3, %xmm6 1164; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15] 1165; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8 1166; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm8[3,4],xmm7[5,6,7] 1167; AVX512DQ-FCP-NEXT: vpshufb %xmm5, %xmm7, %xmm5 1168; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm6[3] 1169; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [6,7,0,1,10,11,4,5,14,15,8,9,2,3,12,13] 1170; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm3, %xmm7 1171; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15] 1172; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9 1173; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2],xmm8[3] 1174; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm8, %xmm6 1175; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1,2],xmm7[3] 1176; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,2,3,12,13,6,7,0,1,10,11,4,5,14,15] 1177; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm3, %xmm3 1178; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] 1179; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 1180; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7] 1181; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm0, %xmm0 1182; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3] 1183; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, (%rsi) 1184; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, (%rdx) 1185; AVX512DQ-FCP-NEXT: vmovdqa %xmm5, (%rcx) 1186; AVX512DQ-FCP-NEXT: vmovdqa %xmm6, (%r8) 1187; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, (%r9) 1188; AVX512DQ-FCP-NEXT: vzeroupper 1189; AVX512DQ-FCP-NEXT: retq 1190; 1191; AVX512BW-LABEL: load_i16_stride5_vf8: 1192; AVX512BW: # %bb.0: 1193; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 1194; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 1195; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm2 = [0,5,10,15,20,25,30,35] 1196; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 1197; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm3 = [1,6,11,16,21,26,31,36] 1198; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 1199; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm4 = [2,7,12,17,22,27,32,37] 1200; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm4 1201; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm5 = [3,8,13,18,23,28,33,38] 1202; 
AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm5 1203; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm6 = [4,9,14,19,24,29,34,39] 1204; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm6 1205; AVX512BW-NEXT: vmovdqa %xmm2, (%rsi) 1206; AVX512BW-NEXT: vmovdqa %xmm3, (%rdx) 1207; AVX512BW-NEXT: vmovdqa %xmm4, (%rcx) 1208; AVX512BW-NEXT: vmovdqa %xmm5, (%r8) 1209; AVX512BW-NEXT: vmovdqa %xmm6, (%r9) 1210; AVX512BW-NEXT: vzeroupper 1211; AVX512BW-NEXT: retq 1212; 1213; AVX512BW-FCP-LABEL: load_i16_stride5_vf8: 1214; AVX512BW-FCP: # %bb.0: 1215; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 1216; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 1217; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm2 = [0,5,10,15,20,25,30,35] 1218; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 1219; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm3 = [1,6,11,16,21,26,31,36] 1220; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 1221; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm4 = [2,7,12,17,22,27,32,37] 1222; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm4 1223; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm5 = [3,8,13,18,23,28,33,38] 1224; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm5 1225; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm6 = [4,9,14,19,24,29,34,39] 1226; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm6 1227; AVX512BW-FCP-NEXT: vmovdqa %xmm2, (%rsi) 1228; AVX512BW-FCP-NEXT: vmovdqa %xmm3, (%rdx) 1229; AVX512BW-FCP-NEXT: vmovdqa %xmm4, (%rcx) 1230; AVX512BW-FCP-NEXT: vmovdqa %xmm5, (%r8) 1231; AVX512BW-FCP-NEXT: vmovdqa %xmm6, (%r9) 1232; AVX512BW-FCP-NEXT: vzeroupper 1233; AVX512BW-FCP-NEXT: retq 1234; 1235; AVX512DQ-BW-LABEL: load_i16_stride5_vf8: 1236; AVX512DQ-BW: # %bb.0: 1237; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0 1238; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1 1239; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm2 = [0,5,10,15,20,25,30,35] 1240; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 1241; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm3 = [1,6,11,16,21,26,31,36] 1242; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 1243; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm4 = [2,7,12,17,22,27,32,37] 1244; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm4 1245; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm5 = [3,8,13,18,23,28,33,38] 1246; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm5 1247; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm6 = [4,9,14,19,24,29,34,39] 1248; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm6 1249; AVX512DQ-BW-NEXT: vmovdqa %xmm2, (%rsi) 1250; AVX512DQ-BW-NEXT: vmovdqa %xmm3, (%rdx) 1251; AVX512DQ-BW-NEXT: vmovdqa %xmm4, (%rcx) 1252; AVX512DQ-BW-NEXT: vmovdqa %xmm5, (%r8) 1253; AVX512DQ-BW-NEXT: vmovdqa %xmm6, (%r9) 1254; AVX512DQ-BW-NEXT: vzeroupper 1255; AVX512DQ-BW-NEXT: retq 1256; 1257; AVX512DQ-BW-FCP-LABEL: load_i16_stride5_vf8: 1258; AVX512DQ-BW-FCP: # %bb.0: 1259; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 1260; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 1261; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm2 = [0,5,10,15,20,25,30,35] 1262; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 1263; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm3 = [1,6,11,16,21,26,31,36] 1264; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 1265; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm4 = [2,7,12,17,22,27,32,37] 1266; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm4 1267; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm5 = [3,8,13,18,23,28,33,38] 1268; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm5 1269; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm6 = [4,9,14,19,24,29,34,39] 1270; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm6 1271; 
AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm2, (%rsi) 1272; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm3, (%rdx) 1273; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm4, (%rcx) 1274; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm5, (%r8) 1275; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm6, (%r9) 1276; AVX512DQ-BW-FCP-NEXT: vzeroupper 1277; AVX512DQ-BW-FCP-NEXT: retq 1278 %wide.vec = load <40 x i16>, ptr %in.vec, align 64 1279 %strided.vec0 = shufflevector <40 x i16> %wide.vec, <40 x i16> poison, <8 x i32> <i32 0, i32 5, i32 10, i32 15, i32 20, i32 25, i32 30, i32 35> 1280 %strided.vec1 = shufflevector <40 x i16> %wide.vec, <40 x i16> poison, <8 x i32> <i32 1, i32 6, i32 11, i32 16, i32 21, i32 26, i32 31, i32 36> 1281 %strided.vec2 = shufflevector <40 x i16> %wide.vec, <40 x i16> poison, <8 x i32> <i32 2, i32 7, i32 12, i32 17, i32 22, i32 27, i32 32, i32 37> 1282 %strided.vec3 = shufflevector <40 x i16> %wide.vec, <40 x i16> poison, <8 x i32> <i32 3, i32 8, i32 13, i32 18, i32 23, i32 28, i32 33, i32 38> 1283 %strided.vec4 = shufflevector <40 x i16> %wide.vec, <40 x i16> poison, <8 x i32> <i32 4, i32 9, i32 14, i32 19, i32 24, i32 29, i32 34, i32 39> 1284 store <8 x i16> %strided.vec0, ptr %out.vec0, align 64 1285 store <8 x i16> %strided.vec1, ptr %out.vec1, align 64 1286 store <8 x i16> %strided.vec2, ptr %out.vec2, align 64 1287 store <8 x i16> %strided.vec3, ptr %out.vec3, align 64 1288 store <8 x i16> %strided.vec4, ptr %out.vec4, align 64 1289 ret void 1290} 1291 1292define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4) nounwind { 1293; SSE-LABEL: load_i16_stride5_vf16: 1294; SSE: # %bb.0: 1295; SSE-NEXT: movdqa 144(%rdi), %xmm14 1296; SSE-NEXT: movdqa 80(%rdi), %xmm8 1297; SSE-NEXT: movdqa 96(%rdi), %xmm7 1298; SSE-NEXT: movdqa 128(%rdi), %xmm15 1299; SSE-NEXT: movdqa 112(%rdi), %xmm12 1300; SSE-NEXT: movdqa 64(%rdi), %xmm10 1301; SSE-NEXT: movdqa (%rdi), %xmm11 1302; SSE-NEXT: movdqa 16(%rdi), %xmm9 1303; SSE-NEXT: movdqa 32(%rdi), %xmm13 1304; SSE-NEXT: movdqa 48(%rdi), %xmm5 1305; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,0,65535,65535,65535] 1306; SSE-NEXT: movdqa %xmm0, %xmm1 1307; SSE-NEXT: pandn %xmm13, %xmm1 1308; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,1,0,3] 1309; SSE-NEXT: pand %xmm0, %xmm2 1310; SSE-NEXT: por %xmm1, %xmm2 1311; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[3,1,2,3] 1312; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] 1313; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[0,2,2,3] 1314; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[0,3,2,3,4,5,6,7] 1315; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] 1316; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,3] 1317; SSE-NEXT: movaps {{.*#+}} xmm6 = [65535,65535,65535,65535,65535,65535,65535,0] 1318; SSE-NEXT: andps %xmm6, %xmm4 1319; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[0,1,0,1] 1320; SSE-NEXT: movaps %xmm6, %xmm2 1321; SSE-NEXT: pandn %xmm1, %xmm2 1322; SSE-NEXT: por %xmm4, %xmm2 1323; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1324; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[0,1,0,3] 1325; SSE-NEXT: pand %xmm0, %xmm1 1326; SSE-NEXT: pandn %xmm12, %xmm0 1327; SSE-NEXT: por %xmm1, %xmm0 1328; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[3,1,2,3] 1329; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] 1330; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[0,2,2,3] 1331; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7] 1332; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 1333; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3] 1334; 
SSE-NEXT: andps %xmm6, %xmm2 1335; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[0,1,0,1] 1336; SSE-NEXT: movaps %xmm6, %xmm1 1337; SSE-NEXT: andnps %xmm0, %xmm1 1338; SSE-NEXT: orps %xmm2, %xmm1 1339; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1340; SSE-NEXT: movdqa %xmm9, %xmm0 1341; SSE-NEXT: psrlq $48, %xmm0 1342; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[0,3,2,3] 1343; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,2,3,4,5,6,7] 1344; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 1345; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,0,0,65535,65535,65535,65535,65535] 1346; SSE-NEXT: movdqa %xmm0, %xmm2 1347; SSE-NEXT: pandn %xmm1, %xmm2 1348; SSE-NEXT: movdqa %xmm5, %xmm3 1349; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,3,2,3] 1350; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm13[0,2,2,3] 1351; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] 1352; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,7,5,6,7] 1353; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] 1354; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] 1355; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,4,7] 1356; SSE-NEXT: pand %xmm0, %xmm1 1357; SSE-NEXT: por %xmm2, %xmm1 1358; SSE-NEXT: movdqa %xmm10, %xmm5 1359; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1360; SSE-NEXT: movdqa %xmm10, %xmm2 1361; SSE-NEXT: psllq $48, %xmm2 1362; SSE-NEXT: movaps %xmm6, %xmm4 1363; SSE-NEXT: andnps %xmm2, %xmm4 1364; SSE-NEXT: pand %xmm6, %xmm1 1365; SSE-NEXT: orps %xmm1, %xmm4 1366; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1367; SSE-NEXT: movdqa %xmm7, %xmm1 1368; SSE-NEXT: psrlq $48, %xmm1 1369; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[0,3,2,3] 1370; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,2,3,4,5,6,7] 1371; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 1372; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[1,3,2,3] 1373; SSE-NEXT: movdqa %xmm15, %xmm10 1374; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm12[0,2,2,3] 1375; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] 1376; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm4[0,1,2,3,7,5,6,7] 1377; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] 1378; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] 1379; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,4,7] 1380; SSE-NEXT: pand %xmm0, %xmm1 1381; SSE-NEXT: pandn %xmm2, %xmm0 1382; SSE-NEXT: por %xmm1, %xmm0 1383; SSE-NEXT: pand %xmm6, %xmm0 1384; SSE-NEXT: movdqa %xmm14, %xmm4 1385; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1386; SSE-NEXT: movdqa %xmm14, %xmm1 1387; SSE-NEXT: psllq $48, %xmm1 1388; SSE-NEXT: pandn %xmm1, %xmm6 1389; SSE-NEXT: por %xmm0, %xmm6 1390; SSE-NEXT: movdqa %xmm3, %xmm0 1391; SSE-NEXT: movdqa %xmm3, %xmm14 1392; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1393; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm13[0,0] 1394; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm13[2,3] 1395; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,1,3] 1396; SSE-NEXT: movaps {{.*#+}} xmm3 = [65535,65535,65535,0,0,0,65535,65535] 1397; SSE-NEXT: movaps %xmm3, %xmm1 1398; SSE-NEXT: andnps %xmm0, %xmm1 1399; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[0,1,1,3] 1400; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm2[0,1,2,3,4,7,6,7] 1401; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm9[2],xmm15[3],xmm9[3] 1402; SSE-NEXT: pand %xmm3, %xmm15 1403; SSE-NEXT: por %xmm1, %xmm15 1404; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] 1405; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,1,2,0] 1406; SSE-NEXT: pshufhw 
{{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5] 1407; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] 1408; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm1[2,0] 1409; SSE-NEXT: movdqa %xmm10, %xmm5 1410; SSE-NEXT: movdqa %xmm10, %xmm1 1411; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm12[0,0] 1412; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm12[2,3] 1413; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,1,3] 1414; SSE-NEXT: movaps %xmm3, %xmm2 1415; SSE-NEXT: andnps %xmm1, %xmm2 1416; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,1,1,3] 1417; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] 1418; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm7[2],xmm0[3],xmm7[3] 1419; SSE-NEXT: pand %xmm3, %xmm0 1420; SSE-NEXT: por %xmm2, %xmm0 1421; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] 1422; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,1,2,0] 1423; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,5] 1424; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm1[2,3] 1425; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0] 1426; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm13[2,2,2,2,4,5,6,7] 1427; SSE-NEXT: movdqa %xmm3, %xmm2 1428; SSE-NEXT: pandn %xmm1, %xmm2 1429; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[1,1,1,1] 1430; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm9[0,2,2,3] 1431; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] 1432; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[0,3,2,3,4,5,6,7] 1433; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 1434; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,3,4,5,6,7] 1435; SSE-NEXT: pand %xmm3, %xmm1 1436; SSE-NEXT: por %xmm2, %xmm1 1437; SSE-NEXT: movdqa %xmm14, %xmm4 1438; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm13[3,0] 1439; SSE-NEXT: movdqa %xmm3, %xmm2 1440; SSE-NEXT: pandn %xmm13, %xmm2 1441; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm4[0,2] 1442; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm13[0,1,2,3,7,4,6,7] 1443; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload 1444; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm10[0,1,0,3] 1445; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5,5,6] 1446; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[3,1],xmm4[2,3] 1447; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm13[2,0] 1448; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm12[2,2,2,2,4,5,6,7] 1449; SSE-NEXT: movdqa %xmm3, %xmm14 1450; SSE-NEXT: pandn %xmm4, %xmm14 1451; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[1,1,1,1] 1452; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm7[0,2,2,3] 1453; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm4[0],xmm13[1],xmm4[1] 1454; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm13[0,3,2,3,4,5,6,7] 1455; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] 1456; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm4[1,0,3,3,4,5,6,7] 1457; SSE-NEXT: pand %xmm3, %xmm13 1458; SSE-NEXT: por %xmm14, %xmm13 1459; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0],xmm9[3,0] 1460; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm11[0,2] 1461; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm7[3,0] 1462; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm8[0,2] 1463; SSE-NEXT: movdqa %xmm5, %xmm11 1464; SSE-NEXT: movdqa %xmm5, %xmm4 1465; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm12[3,0] 1466; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm9[0,1,2,3,4,6,6,7] 1467; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,2,2,3] 1468; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[2,1,3,3,4,5,6,7] 1469; SSE-NEXT: pand %xmm3, %xmm8 1470; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,6,6,7] 1471; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,2,3] 1472; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[2,1,3,3,4,5,6,7] 1473; SSE-NEXT: pand %xmm3, 
%xmm7 1474; SSE-NEXT: pandn %xmm12, %xmm3 1475; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm4[0,2] 1476; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm12[0,1,2,3,7,4,6,7] 1477; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload 1478; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm9[0,1,0,3] 1479; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,5,6] 1480; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,1],xmm4[2,3] 1481; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm5[2,0] 1482; SSE-NEXT: por %xmm2, %xmm8 1483; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 1484; SSE-NEXT: # xmm2 = mem[0,2,2,3] 1485; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm10[0,1,1,3] 1486; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7] 1487; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7] 1488; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm2[1,3] 1489; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm4[2,0] 1490; SSE-NEXT: por %xmm7, %xmm3 1491; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm11[0,2,2,3] 1492; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm9[0,1,1,3] 1493; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7] 1494; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7] 1495; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm2[1,3] 1496; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,0] 1497; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 1498; SSE-NEXT: movaps %xmm2, 16(%rsi) 1499; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 1500; SSE-NEXT: movaps %xmm2, (%rsi) 1501; SSE-NEXT: movdqa %xmm6, 16(%rdx) 1502; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 1503; SSE-NEXT: movaps %xmm2, (%rdx) 1504; SSE-NEXT: movaps %xmm0, 16(%rcx) 1505; SSE-NEXT: movaps %xmm15, (%rcx) 1506; SSE-NEXT: movaps %xmm13, 16(%r8) 1507; SSE-NEXT: movaps %xmm1, (%r8) 1508; SSE-NEXT: movaps %xmm3, 16(%r9) 1509; SSE-NEXT: movaps %xmm8, (%r9) 1510; SSE-NEXT: retq 1511; 1512; AVX-LABEL: load_i16_stride5_vf16: 1513; AVX: # %bb.0: 1514; AVX-NEXT: vmovdqa 96(%rdi), %xmm0 1515; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,1,1,3] 1516; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,5,4,7] 1517; AVX-NEXT: vmovdqa 112(%rdi), %xmm1 1518; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm3 = xmm2[1],xmm1[1] 1519; AVX-NEXT: vmovdqa 80(%rdi), %xmm2 1520; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,2,2,3] 1521; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] 1522; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3,4,5,6,7] 1523; AVX-NEXT: vmovdqa 144(%rdi), %xmm8 1524; AVX-NEXT: vmovdqa 128(%rdi), %xmm7 1525; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1],xmm8[2,3],xmm7[4,5,6,7] 1526; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u,u,u,u,u,u,u,2,3,12,13,6,7] 1527; AVX-NEXT: vpblendw {{.*#+}} xmm10 = xmm3[0,1,2,3,4],xmm4[5,6,7] 1528; AVX-NEXT: vmovdqa (%rdi), %xmm3 1529; AVX-NEXT: vmovdqa 16(%rdi), %xmm4 1530; AVX-NEXT: vmovdqa 32(%rdi), %xmm5 1531; AVX-NEXT: vmovdqa 48(%rdi), %xmm6 1532; AVX-NEXT: vpshufd {{.*#+}} xmm9 = xmm6[0,1,0,3] 1533; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm5[4],xmm9[5,6,7] 1534; AVX-NEXT: vpshufd {{.*#+}} xmm11 = xmm4[3,1,2,3] 1535; AVX-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[2,1,2,3,4,5,6,7] 1536; AVX-NEXT: vpshufd {{.*#+}} xmm12 = xmm3[0,2,2,3] 1537; AVX-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,3,2,3,4,5,6,7] 1538; AVX-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] 1539; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0,1,2,3],xmm9[4,5,6,7] 1540; AVX-NEXT: vmovaps {{.*#+}} ymm11 = 
[65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,65535,65535,65535] 1541; AVX-NEXT: vandps %ymm11, %ymm9, %ymm12 1542; AVX-NEXT: vmovaps 64(%rdi), %xmm9 1543; AVX-NEXT: vshufps {{.*#+}} xmm13 = xmm9[0,1,0,1] 1544; AVX-NEXT: vandnps %ymm13, %ymm11, %ymm13 1545; AVX-NEXT: vorps %ymm13, %ymm12, %ymm12 1546; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm12, %ymm10 1547; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7] 1548; AVX-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,3,0,4,5,6,7] 1549; AVX-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,5,5,5,5] 1550; AVX-NEXT: vpshufd {{.*#+}} xmm13 = xmm2[0,3,2,3] 1551; AVX-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[1,2,2,3,4,5,6,7] 1552; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0,1],xmm12[2,3,4,5,6,7] 1553; AVX-NEXT: vpblendw {{.*#+}} xmm13 = xmm7[0,1,2,3],xmm8[4,5],xmm7[6,7] 1554; AVX-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,u,u,u,u,u,u,4,5,14,15,8,9] 1555; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4],xmm13[5,6,7] 1556; AVX-NEXT: vpsllq $48, %xmm9, %xmm13 1557; AVX-NEXT: vandnps %ymm13, %ymm11, %ymm13 1558; AVX-NEXT: vpsrlq $48, %xmm4, %xmm14 1559; AVX-NEXT: vpshufd {{.*#+}} xmm15 = xmm3[0,3,2,3] 1560; AVX-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[1,2,2,3,4,5,6,7] 1561; AVX-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] 1562; AVX-NEXT: vpblendw {{.*#+}} xmm15 = xmm5[0,1],xmm6[2,3],xmm5[4,5],xmm6[6,7] 1563; AVX-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u,u,0,1,10,11,4,5,14,15,6,7] 1564; AVX-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2],xmm15[3,4,5,6,7] 1565; AVX-NEXT: vandps %ymm11, %ymm14, %ymm11 1566; AVX-NEXT: vorps %ymm13, %ymm11, %ymm11 1567; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm11, %ymm11 1568; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm1[0,1,2,3],xmm0[4,5],xmm1[6,7] 1569; AVX-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,u,u,8,9,2,3,12,13,12,13,u,u,u,u] 1570; AVX-NEXT: vpshufd {{.*#+}} xmm13 = xmm2[3,1,2,3] 1571; AVX-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[2,1,2,3,4,5,6,7] 1572; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0,1],xmm12[2,3,4,5,6,7] 1573; AVX-NEXT: vpblendw {{.*#+}} xmm13 = xmm8[0,1],xmm7[2,3],xmm8[4,5,6,7] 1574; AVX-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,u,u,u,u,u,u,6,7,0,1,10,11] 1575; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4],xmm13[5,6,7] 1576; AVX-NEXT: vpshufd {{.*#+}} xmm13 = xmm3[0,1,1,3] 1577; AVX-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,7,6,7] 1578; AVX-NEXT: vpunpckhdq {{.*#+}} xmm13 = xmm13[2],xmm4[2],xmm13[3],xmm4[3] 1579; AVX-NEXT: vpblendw {{.*#+}} xmm14 = xmm5[0,1],xmm6[2,3],xmm5[4,5,6,7] 1580; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,u,2,3,12,13,6,7,u,u,u,u] 1581; AVX-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2],xmm14[3,4,5],xmm13[6,7] 1582; AVX-NEXT: vpshufd {{.*#+}} xmm14 = xmm9[0,1,2,0] 1583; AVX-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,6,5] 1584; AVX-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,5],xmm14[6,7] 1585; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm13, %ymm12 1586; AVX-NEXT: vpblendw {{.*#+}} xmm13 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] 1587; AVX-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,0,1,10,11,4,5,14,15,u,u,u,u,u,u] 1588; AVX-NEXT: vpsrlq $48, %xmm2, %xmm14 1589; AVX-NEXT: vpblendw {{.*#+}} xmm13 = xmm14[0],xmm13[1,2,3,4,5,6,7] 1590; AVX-NEXT: vpblendw {{.*#+}} xmm14 = xmm8[0,1,2,3],xmm7[4,5],xmm8[6,7] 1591; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,u,u,u,u,u,8,9,2,3,12,13] 1592; AVX-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3,4],xmm14[5,6,7] 1593; AVX-NEXT: vpblendw {{.*#+}} xmm14 = xmm4[0,1],xmm3[2,3],xmm4[4,5,6,7] 
1594; AVX-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[6,7,0,1,10,11,u,u,u,u,u,u,u,u,u,u] 1595; AVX-NEXT: vpblendw {{.*#+}} xmm15 = xmm5[0,1,2,3],xmm6[4,5],xmm5[6,7] 1596; AVX-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[2,2,2,2,4,5,6,7] 1597; AVX-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,7,4,6,7] 1598; AVX-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2],xmm15[3,4,5],xmm14[6,7] 1599; AVX-NEXT: vpshufd {{.*#+}} xmm15 = xmm9[0,1,0,3] 1600; AVX-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,5,6] 1601; AVX-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5],xmm15[6,7] 1602; AVX-NEXT: vinsertf128 $1, %xmm13, %ymm14, %ymm13 1603; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[3,1,2,3] 1604; AVX-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,2,1,4,5,6,7] 1605; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,2,2,3] 1606; AVX-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,0,3,4,5,6,7] 1607; AVX-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] 1608; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] 1609; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3],xmm0[4,5,6,7] 1610; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,3,2,3] 1611; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7] 1612; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm7[4,5,6,7] 1613; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[1,1,1,1] 1614; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[0,2,2,3] 1615; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7] 1616; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 1617; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm3[4,5],xmm4[6,7] 1618; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u] 1619; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4,5],xmm2[6,7] 1620; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm9[0,1,1,3] 1621; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] 1622; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm2[6,7] 1623; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 1624; AVX-NEXT: vmovaps %ymm10, (%rsi) 1625; AVX-NEXT: vmovaps %ymm11, (%rdx) 1626; AVX-NEXT: vmovaps %ymm12, (%rcx) 1627; AVX-NEXT: vmovaps %ymm13, (%r8) 1628; AVX-NEXT: vmovaps %ymm0, (%r9) 1629; AVX-NEXT: vzeroupper 1630; AVX-NEXT: retq 1631; 1632; AVX2-LABEL: load_i16_stride5_vf16: 1633; AVX2: # %bb.0: 1634; AVX2-NEXT: vmovdqa (%rdi), %ymm2 1635; AVX2-NEXT: vmovdqa 32(%rdi), %ymm3 1636; AVX2-NEXT: vmovdqa 64(%rdi), %ymm0 1637; AVX2-NEXT: vmovdqa 96(%rdi), %ymm1 1638; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13],ymm3[14],ymm2[15] 1639; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm5 1640; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1,2,3],xmm4[4,5],xmm5[6,7] 1641; AVX2-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7] 1642; AVX2-NEXT: vpblendw {{.*#+}} ymm5 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15] 1643; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm5[2,3,0,1] 1644; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5],ymm5[6],ymm6[7] 1645; AVX2-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,1,10,11,4,5,14,15,8,9,10,11,4,5,6,7,16,17,26,27,20,21,30,31,24,25,26,27,20,21,22,23] 1646; AVX2-NEXT: vpmovsxbw {{.*#+}} xmm7 = [65535,65535,65535,65535,65535,65535,65535,0] 1647; AVX2-NEXT: vpblendvb %ymm7, %ymm4, %ymm5, %ymm5 1648; AVX2-NEXT: vmovdqa 144(%rdi), %xmm6 1649; AVX2-NEXT: vmovdqa 128(%rdi), %xmm4 1650; AVX2-NEXT: vpblendd {{.*#+}} xmm8 = xmm4[0],xmm6[1],xmm4[2,3] 1651; AVX2-NEXT: vpshufb {{.*#+}} xmm8 
= xmm8[u,u,u,u,u,u,u,u,u,u,2,3,12,13,6,7] 1652; AVX2-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 1653; AVX2-NEXT: vpblendw {{.*#+}} ymm8 = ymm5[0,1,2,3,4],ymm8[5,6,7],ymm5[8,9,10,11,12],ymm8[13,14,15] 1654; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm8[4,5,6,7] 1655; AVX2-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10],ymm2[11],ymm3[12,13],ymm2[14],ymm3[15] 1656; AVX2-NEXT: vextracti128 $1, %ymm8, %xmm9 1657; AVX2-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3],xmm8[4,5,6],xmm9[7] 1658; AVX2-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11] 1659; AVX2-NEXT: vpblendw {{.*#+}} ymm9 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] 1660; AVX2-NEXT: vpermq {{.*#+}} ymm10 = ymm9[2,3,0,1] 1661; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm10[5],ymm9[6,7] 1662; AVX2-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[2,3,12,13,6,7,0,1,10,11,6,7,8,9,8,9,18,19,28,29,22,23,16,17,26,27,22,23,24,25,24,25] 1663; AVX2-NEXT: vpblendvb %ymm7, %ymm8, %ymm9, %ymm7 1664; AVX2-NEXT: vpblendd {{.*#+}} xmm8 = xmm4[0,1],xmm6[2],xmm4[3] 1665; AVX2-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,u,4,5,14,15,8,9] 1666; AVX2-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 1667; AVX2-NEXT: vpblendw {{.*#+}} ymm8 = ymm7[0,1,2,3,4],ymm8[5,6,7],ymm7[8,9,10,11,12],ymm8[13,14,15] 1668; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] 1669; AVX2-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0,1],ymm2[2],ymm3[3],ymm2[4],ymm3[5,6],ymm2[7],ymm3[8,9],ymm2[10],ymm3[11],ymm2[12],ymm3[13,14],ymm2[15] 1670; AVX2-NEXT: vextracti128 $1, %ymm8, %xmm9 1671; AVX2-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm9[3,4],xmm8[5,6,7] 1672; AVX2-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u] 1673; AVX2-NEXT: vpblendw {{.*#+}} ymm9 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] 1674; AVX2-NEXT: vpermq {{.*#+}} ymm10 = ymm9[2,3,0,1] 1675; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6],ymm9[7] 1676; AVX2-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27] 1677; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2],ymm9[3,4,5,6,7] 1678; AVX2-NEXT: vpblendd {{.*#+}} xmm9 = xmm6[0],xmm4[1],xmm6[2,3] 1679; AVX2-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,u,u,6,7,0,1,10,11] 1680; AVX2-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 1681; AVX2-NEXT: vpblendw {{.*#+}} ymm9 = ymm8[0,1,2,3,4],ymm9[5,6,7],ymm8[8,9,10,11,12],ymm9[13,14,15] 1682; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] 1683; AVX2-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0],ymm3[1,2],ymm2[3],ymm3[4],ymm2[5],ymm3[6,7],ymm2[8],ymm3[9,10],ymm2[11],ymm3[12],ymm2[13],ymm3[14,15] 1684; AVX2-NEXT: vextracti128 $1, %ymm9, %xmm10 1685; AVX2-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2],xmm9[3] 1686; AVX2-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u] 1687; AVX2-NEXT: vpblendw {{.*#+}} ymm10 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] 1688; AVX2-NEXT: vpermq {{.*#+}} ymm11 = ymm10[2,3,0,1] 1689; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4],ymm10[5],ymm11[6],ymm10[7] 1690; AVX2-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29] 1691; AVX2-NEXT: vpblendd {{.*#+}} 
ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7] 1692; AVX2-NEXT: vpblendd {{.*#+}} xmm10 = xmm6[0,1],xmm4[2],xmm6[3] 1693; AVX2-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u,u,u,u,8,9,2,3,12,13] 1694; AVX2-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 1695; AVX2-NEXT: vpblendw {{.*#+}} ymm10 = ymm9[0,1,2,3,4],ymm10[5,6,7],ymm9[8,9,10,11,12],ymm10[13,14,15] 1696; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] 1697; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13],ymm2[14],ymm3[15] 1698; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 1699; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7] 1700; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[8,9,2,3,12,13,6,7,0,1,10,11,u,u,u,u] 1701; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15] 1702; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] 1703; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6],ymm1[7] 1704; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,4,5,14,15,24,25,18,19,28,29,22,23,u,u,u,u,u,u,u,u] 1705; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] 1706; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[3,1,2,3] 1707; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,1,4,5,6,7] 1708; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[0,2,2,3] 1709; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7] 1710; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 1711; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 1712; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] 1713; AVX2-NEXT: vmovdqa %ymm5, (%rsi) 1714; AVX2-NEXT: vmovdqa %ymm7, (%rdx) 1715; AVX2-NEXT: vmovdqa %ymm8, (%rcx) 1716; AVX2-NEXT: vmovdqa %ymm9, (%r8) 1717; AVX2-NEXT: vmovdqa %ymm0, (%r9) 1718; AVX2-NEXT: vzeroupper 1719; AVX2-NEXT: retq 1720; 1721; AVX2-FP-LABEL: load_i16_stride5_vf16: 1722; AVX2-FP: # %bb.0: 1723; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm2 1724; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm3 1725; AVX2-FP-NEXT: vmovdqa 64(%rdi), %ymm0 1726; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm1 1727; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm4 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13],ymm3[14],ymm2[15] 1728; AVX2-FP-NEXT: vextracti128 $1, %ymm4, %xmm5 1729; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1,2,3],xmm4[4,5],xmm5[6,7] 1730; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7] 1731; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm5 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15] 1732; AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm5[2,3,0,1] 1733; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5],ymm5[6],ymm6[7] 1734; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,1,10,11,4,5,14,15,8,9,10,11,4,5,6,7,16,17,26,27,20,21,30,31,24,25,26,27,20,21,22,23] 1735; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} xmm7 = [65535,65535,65535,65535,65535,65535,65535,0] 1736; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm4, %ymm5, %ymm6 1737; AVX2-FP-NEXT: vmovdqa 144(%rdi), %xmm4 1738; AVX2-FP-NEXT: vmovdqa 128(%rdi), %xmm5 1739; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm8 = xmm5[0],xmm4[1],xmm5[2,3] 1740; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,u,2,3,12,13,6,7] 1741; AVX2-FP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 1742; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm8 = 
ymm6[0,1,2,3,4],ymm8[5,6,7],ymm6[8,9,10,11,12],ymm8[13,14,15] 1743; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm8[4,5,6,7] 1744; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10],ymm2[11],ymm3[12,13],ymm2[14],ymm3[15] 1745; AVX2-FP-NEXT: vextracti128 $1, %ymm8, %xmm9 1746; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3],xmm8[4,5,6],xmm9[7] 1747; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11] 1748; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm9 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] 1749; AVX2-FP-NEXT: vpermq {{.*#+}} ymm10 = ymm9[2,3,0,1] 1750; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm10[5],ymm9[6,7] 1751; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[2,3,12,13,6,7,0,1,10,11,6,7,8,9,8,9,18,19,28,29,22,23,16,17,26,27,22,23,24,25,24,25] 1752; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm8, %ymm9, %ymm7 1753; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm8 = xmm5[0,1],xmm4[2],xmm5[3] 1754; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,u,4,5,14,15,8,9] 1755; AVX2-FP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 1756; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm8 = ymm7[0,1,2,3,4],ymm8[5,6,7],ymm7[8,9,10,11,12],ymm8[13,14,15] 1757; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] 1758; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0,1],ymm2[2],ymm3[3],ymm2[4],ymm3[5,6],ymm2[7],ymm3[8,9],ymm2[10],ymm3[11],ymm2[12],ymm3[13,14],ymm2[15] 1759; AVX2-FP-NEXT: vextracti128 $1, %ymm8, %xmm9 1760; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm9[3,4],xmm8[5,6,7] 1761; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u] 1762; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm9 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] 1763; AVX2-FP-NEXT: vpermq {{.*#+}} ymm10 = ymm9[2,3,0,1] 1764; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6],ymm9[7] 1765; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27] 1766; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2],ymm9[3,4,5,6,7] 1767; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm9 = xmm4[0],xmm5[1],xmm4[2,3] 1768; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,u,u,6,7,0,1,10,11] 1769; AVX2-FP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 1770; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm9 = ymm8[0,1,2,3,4],ymm9[5,6,7],ymm8[8,9,10,11,12],ymm9[13,14,15] 1771; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] 1772; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0],ymm3[1,2],ymm2[3],ymm3[4],ymm2[5],ymm3[6,7],ymm2[8],ymm3[9,10],ymm2[11],ymm3[12],ymm2[13],ymm3[14,15] 1773; AVX2-FP-NEXT: vextracti128 $1, %ymm9, %xmm10 1774; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2],xmm9[3] 1775; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u] 1776; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm10 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] 1777; AVX2-FP-NEXT: vpermq {{.*#+}} ymm11 = ymm10[2,3,0,1] 1778; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4],ymm10[5],ymm11[6],ymm10[7] 1779; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29] 1780; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm9 = 
ymm9[0,1,2],ymm10[3,4,5,6,7] 1781; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm10 = xmm4[0,1],xmm5[2],xmm4[3] 1782; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u,u,u,u,8,9,2,3,12,13] 1783; AVX2-FP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 1784; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm10 = ymm9[0,1,2,3,4],ymm10[5,6,7],ymm9[8,9,10,11,12],ymm10[13,14,15] 1785; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] 1786; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13],ymm2[14],ymm3[15] 1787; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3 1788; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7] 1789; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[8,9,2,3,12,13,6,7,0,1,10,11,u,u,u,u] 1790; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15] 1791; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] 1792; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6],ymm1[7] 1793; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,4,5,14,15,24,25,18,19,28,29,22,23,u,u,u,u,u,u,u,u] 1794; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] 1795; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[12,13,14,15,4,5,14,15,u,u,u,u,u,u,u,u] 1796; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = xmm5[0,1,2,3,0,1,10,11,u,u,u,u,u,u,u,u] 1797; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 1798; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 1799; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] 1800; AVX2-FP-NEXT: vmovdqa %ymm6, (%rsi) 1801; AVX2-FP-NEXT: vmovdqa %ymm7, (%rdx) 1802; AVX2-FP-NEXT: vmovdqa %ymm8, (%rcx) 1803; AVX2-FP-NEXT: vmovdqa %ymm9, (%r8) 1804; AVX2-FP-NEXT: vmovdqa %ymm0, (%r9) 1805; AVX2-FP-NEXT: vzeroupper 1806; AVX2-FP-NEXT: retq 1807; 1808; AVX2-FCP-LABEL: load_i16_stride5_vf16: 1809; AVX2-FCP: # %bb.0: 1810; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm0 1811; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm3 1812; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm4 1813; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm1 1814; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm2 1815; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5],ymm4[6],ymm3[7,8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13],ymm4[14],ymm3[15] 1816; AVX2-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6 1817; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm6[1,2,3],xmm5[4,5],xmm6[6,7] 1818; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7] 1819; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm1[0],ymm2[1,2],ymm1[3],ymm2[4],ymm1[5],ymm2[6,7],ymm1[8],ymm2[9,10],ymm1[11],ymm2[12],ymm1[13],ymm2[14,15] 1820; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [1,3,0,2,4,6,1,3] 1821; AVX2-FCP-NEXT: vpermd %ymm6, %ymm7, %ymm6 1822; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[0,1,6,7,8,9,14,15,4,5,14,15,4,5,2,3,16,17,22,23,24,25,30,31,20,21,30,31,20,21,18,19] 1823; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} xmm8 = [65535,65535,65535,65535,65535,65535,65535,0] 1824; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm5, %ymm6, %ymm5 1825; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,0,3,5,0] 1826; AVX2-FCP-NEXT: vpermd %ymm0, %ymm6, %ymm6 1827; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm7 = [0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27] 1828; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm6, %ymm6 1829; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm6 = 
ymm5[0,1,2,3,4],ymm6[5,6,7],ymm5[8,9,10,11,12],ymm6[13,14,15] 1830; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] 1831; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm4[0],ymm3[1],ymm4[2],ymm3[3],ymm4[4,5],ymm3[6],ymm4[7,8],ymm3[9],ymm4[10],ymm3[11],ymm4[12,13],ymm3[14],ymm4[15] 1832; AVX2-FCP-NEXT: vextracti128 $1, %ymm6, %xmm9 1833; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm9[2,3],xmm6[4,5,6],xmm9[7] 1834; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11] 1835; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13],ymm1[14],ymm2[15] 1836; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [2,0,0,0,4,7,1,6] 1837; AVX2-FCP-NEXT: vpermd %ymm9, %ymm10, %ymm9 1838; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[2,3,4,5,10,11,0,1,14,15,2,3,12,13,0,1,18,19,20,21,26,27,16,17,30,31,18,19,28,29,16,17] 1839; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm6, %ymm9, %ymm6 1840; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,0,0,0,1,3,6,0] 1841; AVX2-FCP-NEXT: vpermd %ymm0, %ymm8, %ymm9 1842; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm8 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25] 1843; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm9, %ymm9 1844; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm6[0,1,2,3,4],ymm9[5,6,7],ymm6[8,9,10,11,12],ymm9[13,14,15] 1845; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm9[4,5,6,7] 1846; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm4[0,1],ymm3[2],ymm4[3],ymm3[4],ymm4[5,6],ymm3[7],ymm4[8,9],ymm3[10],ymm4[11],ymm3[12],ymm4[13,14],ymm3[15] 1847; AVX2-FCP-NEXT: vextracti128 $1, %ymm9, %xmm10 1848; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm10[3,4],xmm9[5,6,7] 1849; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u] 1850; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] 1851; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,2,0,0,5,7,2,4] 1852; AVX2-FCP-NEXT: vpermd %ymm10, %ymm11, %ymm10 1853; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23] 1854; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7] 1855; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [1,4,6,0,1,4,6,0] 1856; AVX2-FCP-NEXT: # ymm10 = mem[0,1,0,1] 1857; AVX2-FCP-NEXT: vpermd %ymm0, %ymm10, %ymm10 1858; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm10, %ymm7 1859; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm9[0,1,2,3,4],ymm7[5,6,7],ymm9[8,9,10,11,12],ymm7[13,14,15] 1860; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] 1861; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm3[0],ymm4[1,2],ymm3[3],ymm4[4],ymm3[5],ymm4[6,7],ymm3[8],ymm4[9,10],ymm3[11],ymm4[12],ymm3[13],ymm4[14,15] 1862; AVX2-FCP-NEXT: vextracti128 $1, %ymm9, %xmm10 1863; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2],xmm9[3] 1864; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u] 1865; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] 1866; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,3,0,0,5,0,2,7] 1867; AVX2-FCP-NEXT: vpermd %ymm10, %ymm11, %ymm10 1868; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,2,3,4,5,18,19,20,21,26,27,16,17,30,31,30,31,18,19,20,21] 1869; AVX2-FCP-NEXT: 
vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7] 1870; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [2,4,7,0,2,4,7,0] 1871; AVX2-FCP-NEXT: # ymm10 = mem[0,1,0,1] 1872; AVX2-FCP-NEXT: vpermd %ymm0, %ymm10, %ymm10 1873; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm10, %ymm8 1874; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5,6,7],ymm9[8,9,10,11,12],ymm8[13,14,15] 1875; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] 1876; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5],ymm3[6],ymm4[7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13],ymm3[14],ymm4[15] 1877; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4 1878; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3,4],xmm4[5,6,7] 1879; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[8,9,2,3,12,13,6,7,0,1,10,11,u,u,u,u] 1880; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3],ymm1[4],ymm2[5,6],ymm1[7],ymm2[8,9],ymm1[10],ymm2[11],ymm1[12],ymm2[13,14],ymm1[15] 1881; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [1,3,0,0,6,0,3,5] 1882; AVX2-FCP-NEXT: vpermd %ymm1, %ymm2, %ymm1 1883; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,u,u,u,u,u,u,u,u] 1884; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7] 1885; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,0,0,0,0,2,5,7] 1886; AVX2-FCP-NEXT: vpermd %ymm0, %ymm2, %ymm0 1887; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31] 1888; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] 1889; AVX2-FCP-NEXT: vmovdqa %ymm5, (%rsi) 1890; AVX2-FCP-NEXT: vmovdqa %ymm6, (%rdx) 1891; AVX2-FCP-NEXT: vmovdqa %ymm7, (%rcx) 1892; AVX2-FCP-NEXT: vmovdqa %ymm8, (%r8) 1893; AVX2-FCP-NEXT: vmovdqa %ymm0, (%r9) 1894; AVX2-FCP-NEXT: vzeroupper 1895; AVX2-FCP-NEXT: retq 1896; 1897; AVX512-LABEL: load_i16_stride5_vf16: 1898; AVX512: # %bb.0: 1899; AVX512-NEXT: vmovdqa (%rdi), %ymm2 1900; AVX512-NEXT: vmovdqa 32(%rdi), %ymm3 1901; AVX512-NEXT: vmovdqa 64(%rdi), %ymm0 1902; AVX512-NEXT: vmovdqa 96(%rdi), %ymm1 1903; AVX512-NEXT: vpblendw {{.*#+}} ymm4 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15] 1904; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] 1905; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5],ymm4[6],ymm5[7] 1906; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[6,7,16,17,26,27,20,21,30,31,24,25],zero,zero,zero,zero,zero,zero 1907; AVX512-NEXT: vpblendw {{.*#+}} ymm5 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13],ymm3[14],ymm2[15] 1908; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm6 1909; AVX512-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm6[1,2,3],xmm5[4,5],xmm6[6,7] 1910; AVX512-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,1,10,11,4,5,14,15,8,9,2,3,12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[u,u,u,u,u,u] 1911; AVX512-NEXT: vpor %ymm4, %ymm5, %ymm5 1912; AVX512-NEXT: vmovdqa 144(%rdi), %xmm6 1913; AVX512-NEXT: vmovdqa 128(%rdi), %xmm4 1914; AVX512-NEXT: vpblendd {{.*#+}} xmm7 = xmm4[0],xmm6[1],xmm4[2,3] 1915; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,u,2,3,12,13,6,7] 1916; AVX512-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 1917; AVX512-NEXT: vpblendw {{.*#+}} ymm7 = ymm5[0,1,2,3,4],ymm7[5,6,7],ymm5[8,9,10,11,12],ymm7[13,14,15] 1918; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = 
ymm5[0,1,2,3],ymm7[4,5,6,7] 1919; AVX512-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] 1920; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm7[2,3,0,1] 1921; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm8[5],ymm7[6,7] 1922; AVX512-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[8,9,18,19,28,29,22,23,16,17,26,27],zero,zero,zero,zero,zero,zero 1923; AVX512-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10],ymm2[11],ymm3[12,13],ymm2[14],ymm3[15] 1924; AVX512-NEXT: vextracti128 $1, %ymm8, %xmm9 1925; AVX512-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3],xmm8[4,5,6],xmm9[7] 1926; AVX512-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[2,3,12,13,6,7,0,1,10,11,4,5,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[u,u,u,u,u,u] 1927; AVX512-NEXT: vpor %ymm7, %ymm8, %ymm7 1928; AVX512-NEXT: vpblendd {{.*#+}} xmm8 = xmm4[0,1],xmm6[2],xmm4[3] 1929; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,u,4,5,14,15,8,9] 1930; AVX512-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 1931; AVX512-NEXT: vpblendw {{.*#+}} ymm8 = ymm7[0,1,2,3,4],ymm8[5,6,7],ymm7[8,9,10,11,12],ymm8[13,14,15] 1932; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] 1933; AVX512-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0,1],ymm2[2],ymm3[3],ymm2[4],ymm3[5,6],ymm2[7],ymm3[8,9],ymm2[10],ymm3[11],ymm2[12],ymm3[13,14],ymm2[15] 1934; AVX512-NEXT: vextracti128 $1, %ymm8, %xmm9 1935; AVX512-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm9[3,4],xmm8[5,6,7] 1936; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u] 1937; AVX512-NEXT: vpblendw {{.*#+}} ymm9 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] 1938; AVX512-NEXT: vpermq {{.*#+}} ymm10 = ymm9[2,3,0,1] 1939; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6],ymm9[7] 1940; AVX512-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27] 1941; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2],ymm9[3,4,5,6,7] 1942; AVX512-NEXT: vpblendd {{.*#+}} xmm9 = xmm6[0],xmm4[1],xmm6[2,3] 1943; AVX512-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,u,u,6,7,0,1,10,11] 1944; AVX512-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 1945; AVX512-NEXT: vpblendw {{.*#+}} ymm9 = ymm8[0,1,2,3,4],ymm9[5,6,7],ymm8[8,9,10,11,12],ymm9[13,14,15] 1946; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] 1947; AVX512-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0],ymm3[1,2],ymm2[3],ymm3[4],ymm2[5],ymm3[6,7],ymm2[8],ymm3[9,10],ymm2[11],ymm3[12],ymm2[13],ymm3[14,15] 1948; AVX512-NEXT: vextracti128 $1, %ymm9, %xmm10 1949; AVX512-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2],xmm9[3] 1950; AVX512-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u] 1951; AVX512-NEXT: vpblendw {{.*#+}} ymm10 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] 1952; AVX512-NEXT: vpermq {{.*#+}} ymm11 = ymm10[2,3,0,1] 1953; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4],ymm10[5],ymm11[6],ymm10[7] 1954; AVX512-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29] 1955; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7] 1956; AVX512-NEXT: 
vpblendd {{.*#+}} xmm10 = xmm6[0,1],xmm4[2],xmm6[3] 1957; AVX512-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u,u,u,u,8,9,2,3,12,13] 1958; AVX512-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 1959; AVX512-NEXT: vpblendw {{.*#+}} ymm10 = ymm9[0,1,2,3,4],ymm10[5,6,7],ymm9[8,9,10,11,12],ymm10[13,14,15] 1960; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] 1961; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13],ymm2[14],ymm3[15] 1962; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3 1963; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7] 1964; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[8,9,2,3,12,13,6,7,0,1,10,11,u,u,u,u] 1965; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15] 1966; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] 1967; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6],ymm1[7] 1968; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,4,5,14,15,24,25,18,19,28,29,22,23,u,u,u,u,u,u,u,u] 1969; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] 1970; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[3,1,2,3] 1971; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,1,4,5,6,7] 1972; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[0,2,2,3] 1973; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7] 1974; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 1975; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 1976; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] 1977; AVX512-NEXT: vmovdqa %ymm5, (%rsi) 1978; AVX512-NEXT: vmovdqa %ymm7, (%rdx) 1979; AVX512-NEXT: vmovdqa %ymm8, (%rcx) 1980; AVX512-NEXT: vmovdqa %ymm9, (%r8) 1981; AVX512-NEXT: vmovdqa %ymm0, (%r9) 1982; AVX512-NEXT: vzeroupper 1983; AVX512-NEXT: retq 1984; 1985; AVX512-FCP-LABEL: load_i16_stride5_vf16: 1986; AVX512-FCP: # %bb.0: 1987; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm2 1988; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm3 1989; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm0 1990; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %ymm1 1991; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15] 1992; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [1,0,0,0,4,6,1,3] 1993; AVX512-FCP-NEXT: vpermd %ymm4, %ymm5, %ymm4 1994; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[2,3,16,17,22,23,24,25,30,31,20,21],zero,zero,zero,zero,zero,zero 1995; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13],ymm3[14],ymm2[15] 1996; AVX512-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6 1997; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm6[1,2,3],xmm5[4,5],xmm6[6,7] 1998; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,1,10,11,4,5,14,15,8,9,2,3,12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[u,u,u,u,u,u] 1999; AVX512-FCP-NEXT: vpor %ymm4, %ymm5, %ymm5 2000; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,0,3,5,0] 2001; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm4 2002; AVX512-FCP-NEXT: vpermd %ymm4, %ymm6, %ymm6 2003; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} ymm7 = [0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27] 2004; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm6, %ymm6 2005; 
AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0,1,2,3,4],ymm6[5,6,7],ymm5[8,9,10,11,12],ymm6[13,14,15] 2006; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] 2007; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] 2008; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [2,0,0,0,4,7,1,6] 2009; AVX512-FCP-NEXT: vpermd %ymm6, %ymm8, %ymm6 2010; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[0,1,18,19,20,21,26,27,16,17,30,31],zero,zero,zero,zero,zero,zero 2011; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10],ymm2[11],ymm3[12,13],ymm2[14],ymm3[15] 2012; AVX512-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9 2013; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3],xmm8[4,5,6],xmm9[7] 2014; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[2,3,12,13,6,7,0,1,10,11,4,5,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[u,u,u,u,u,u] 2015; AVX512-FCP-NEXT: vpor %ymm6, %ymm8, %ymm6 2016; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,0,0,0,1,3,6,0] 2017; AVX512-FCP-NEXT: vpermd %ymm4, %ymm8, %ymm9 2018; AVX512-FCP-NEXT: vpbroadcastq {{.*#+}} ymm8 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25] 2019; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm9, %ymm9 2020; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm6[0,1,2,3,4],ymm9[5,6,7],ymm6[8,9,10,11,12],ymm9[13,14,15] 2021; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm9[4,5,6,7] 2022; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm3[0,1],ymm2[2],ymm3[3],ymm2[4],ymm3[5,6],ymm2[7],ymm3[8,9],ymm2[10],ymm3[11],ymm2[12],ymm3[13,14],ymm2[15] 2023; AVX512-FCP-NEXT: vextracti128 $1, %ymm9, %xmm10 2024; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm10[3,4],xmm9[5,6,7] 2025; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u] 2026; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] 2027; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,2,0,0,5,7,2,4] 2028; AVX512-FCP-NEXT: vpermd %ymm10, %ymm11, %ymm10 2029; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23] 2030; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7] 2031; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [1,4,6,0,1,4,6,0] 2032; AVX512-FCP-NEXT: # ymm10 = mem[0,1,0,1] 2033; AVX512-FCP-NEXT: vpermd %ymm4, %ymm10, %ymm10 2034; AVX512-FCP-NEXT: vpshufb %ymm7, %ymm10, %ymm7 2035; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm9[0,1,2,3,4],ymm7[5,6,7],ymm9[8,9,10,11,12],ymm7[13,14,15] 2036; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] 2037; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0],ymm3[1,2],ymm2[3],ymm3[4],ymm2[5],ymm3[6,7],ymm2[8],ymm3[9,10],ymm2[11],ymm3[12],ymm2[13],ymm3[14,15] 2038; AVX512-FCP-NEXT: vextracti128 $1, %ymm9, %xmm10 2039; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2],xmm9[3] 2040; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u] 2041; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] 2042; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = 
[0,3,0,0,5,0,2,7] 2043; AVX512-FCP-NEXT: vpermd %ymm10, %ymm11, %ymm10 2044; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,2,3,4,5,18,19,20,21,26,27,16,17,30,31,30,31,18,19,20,21] 2045; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7] 2046; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [2,4,7,0,2,4,7,0] 2047; AVX512-FCP-NEXT: # ymm10 = mem[0,1,0,1] 2048; AVX512-FCP-NEXT: vpermd %ymm4, %ymm10, %ymm10 2049; AVX512-FCP-NEXT: vpshufb %ymm8, %ymm10, %ymm8 2050; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5,6,7],ymm9[8,9,10,11,12],ymm8[13,14,15] 2051; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] 2052; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13],ymm2[14],ymm3[15] 2053; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 2054; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7] 2055; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[8,9,2,3,12,13,6,7,0,1,10,11,u,u,u,u] 2056; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15] 2057; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [1,3,0,0,6,0,3,5] 2058; AVX512-FCP-NEXT: vpermd %ymm0, %ymm1, %ymm0 2059; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,u,u,u,u,u,u,u,u] 2060; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] 2061; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,0,0,2,5,7] 2062; AVX512-FCP-NEXT: vpermd %ymm4, %ymm1, %ymm1 2063; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,u,u,u,u,24,25,30,31,u,u,u,u] 2064; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,1,2,3,4,5,12,14] 2065; AVX512-FCP-NEXT: vpermi2d %ymm1, %ymm0, %ymm2 2066; AVX512-FCP-NEXT: vmovdqa %ymm5, (%rsi) 2067; AVX512-FCP-NEXT: vmovdqa %ymm6, (%rdx) 2068; AVX512-FCP-NEXT: vmovdqa %ymm7, (%rcx) 2069; AVX512-FCP-NEXT: vmovdqa %ymm8, (%r8) 2070; AVX512-FCP-NEXT: vmovdqa %ymm2, (%r9) 2071; AVX512-FCP-NEXT: vzeroupper 2072; AVX512-FCP-NEXT: retq 2073; 2074; AVX512DQ-LABEL: load_i16_stride5_vf16: 2075; AVX512DQ: # %bb.0: 2076; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm2 2077; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm3 2078; AVX512DQ-NEXT: vmovdqa 64(%rdi), %ymm0 2079; AVX512DQ-NEXT: vmovdqa 96(%rdi), %ymm1 2080; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15] 2081; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] 2082; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5],ymm4[6],ymm5[7] 2083; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[6,7,16,17,26,27,20,21,30,31,24,25],zero,zero,zero,zero,zero,zero 2084; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm5 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13],ymm3[14],ymm2[15] 2085; AVX512DQ-NEXT: vextracti128 $1, %ymm5, %xmm6 2086; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm6[1,2,3],xmm5[4,5],xmm6[6,7] 2087; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,1,10,11,4,5,14,15,8,9,2,3,12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[u,u,u,u,u,u] 2088; AVX512DQ-NEXT: vpor %ymm4, %ymm5, %ymm5 2089; AVX512DQ-NEXT: vmovdqa 144(%rdi), %xmm6 2090; AVX512DQ-NEXT: vmovdqa 128(%rdi), %xmm4 2091; 
AVX512DQ-NEXT: vpblendd {{.*#+}} xmm7 = xmm4[0],xmm6[1],xmm4[2,3] 2092; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,u,u,u,u,u,u,2,3,12,13,6,7] 2093; AVX512DQ-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 2094; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm7 = ymm5[0,1,2,3,4],ymm7[5,6,7],ymm5[8,9,10,11,12],ymm7[13,14,15] 2095; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7] 2096; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] 2097; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm7[2,3,0,1] 2098; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm8[5],ymm7[6,7] 2099; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[8,9,18,19,28,29,22,23,16,17,26,27],zero,zero,zero,zero,zero,zero 2100; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10],ymm2[11],ymm3[12,13],ymm2[14],ymm3[15] 2101; AVX512DQ-NEXT: vextracti128 $1, %ymm8, %xmm9 2102; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3],xmm8[4,5,6],xmm9[7] 2103; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[2,3,12,13,6,7,0,1,10,11,4,5,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[u,u,u,u,u,u] 2104; AVX512DQ-NEXT: vpor %ymm7, %ymm8, %ymm7 2105; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm8 = xmm4[0,1],xmm6[2],xmm4[3] 2106; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,u,u,u,u,u,4,5,14,15,8,9] 2107; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 2108; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm8 = ymm7[0,1,2,3,4],ymm8[5,6,7],ymm7[8,9,10,11,12],ymm8[13,14,15] 2109; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7] 2110; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0,1],ymm2[2],ymm3[3],ymm2[4],ymm3[5,6],ymm2[7],ymm3[8,9],ymm2[10],ymm3[11],ymm2[12],ymm3[13,14],ymm2[15] 2111; AVX512DQ-NEXT: vextracti128 $1, %ymm8, %xmm9 2112; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1,2],xmm9[3,4],xmm8[5,6,7] 2113; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u] 2114; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm9 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] 2115; AVX512DQ-NEXT: vpermq {{.*#+}} ymm10 = ymm9[2,3,0,1] 2116; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm10[6],ymm9[7] 2117; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27] 2118; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2],ymm9[3,4,5,6,7] 2119; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm9 = xmm6[0],xmm4[1],xmm6[2,3] 2120; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,u,u,6,7,0,1,10,11] 2121; AVX512DQ-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 2122; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm9 = ymm8[0,1,2,3,4],ymm9[5,6,7],ymm8[8,9,10,11,12],ymm9[13,14,15] 2123; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7] 2124; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0],ymm3[1,2],ymm2[3],ymm3[4],ymm2[5],ymm3[6,7],ymm2[8],ymm3[9,10],ymm2[11],ymm3[12],ymm2[13],ymm3[14,15] 2125; AVX512DQ-NEXT: vextracti128 $1, %ymm9, %xmm10 2126; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2],xmm9[3] 2127; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u] 2128; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm10 = 
ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] 2129; AVX512DQ-NEXT: vpermq {{.*#+}} ymm11 = ymm10[2,3,0,1] 2130; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4],ymm10[5],ymm11[6],ymm10[7] 2131; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29] 2132; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7] 2133; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm10 = xmm6[0,1],xmm4[2],xmm6[3] 2134; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u,u,u,u,u,u,u,u,8,9,2,3,12,13] 2135; AVX512DQ-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 2136; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm10 = ymm9[0,1,2,3,4],ymm10[5,6,7],ymm9[8,9,10,11,12],ymm10[13,14,15] 2137; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7] 2138; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13],ymm2[14],ymm3[15] 2139; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3 2140; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7] 2141; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[8,9,2,3,12,13,6,7,0,1,10,11,u,u,u,u] 2142; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15] 2143; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] 2144; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6],ymm1[7] 2145; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,4,5,14,15,24,25,18,19,28,29,22,23,u,u,u,u,u,u,u,u] 2146; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] 2147; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[3,1,2,3] 2148; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,1,4,5,6,7] 2149; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[0,2,2,3] 2150; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7] 2151; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 2152; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 2153; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] 2154; AVX512DQ-NEXT: vmovdqa %ymm5, (%rsi) 2155; AVX512DQ-NEXT: vmovdqa %ymm7, (%rdx) 2156; AVX512DQ-NEXT: vmovdqa %ymm8, (%rcx) 2157; AVX512DQ-NEXT: vmovdqa %ymm9, (%r8) 2158; AVX512DQ-NEXT: vmovdqa %ymm0, (%r9) 2159; AVX512DQ-NEXT: vzeroupper 2160; AVX512DQ-NEXT: retq 2161; 2162; AVX512DQ-FCP-LABEL: load_i16_stride5_vf16: 2163; AVX512DQ-FCP: # %bb.0: 2164; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm2 2165; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm3 2166; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm0 2167; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %ymm1 2168; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15] 2169; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [1,0,0,0,4,6,1,3] 2170; AVX512DQ-FCP-NEXT: vpermd %ymm4, %ymm5, %ymm4 2171; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[2,3,16,17,22,23,24,25,30,31,20,21],zero,zero,zero,zero,zero,zero 2172; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13],ymm3[14],ymm2[15] 2173; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm5, %xmm6 2174; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = 
xmm5[0],xmm6[1,2,3],xmm5[4,5],xmm6[6,7] 2175; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,1,10,11,4,5,14,15,8,9,2,3,12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[u,u,u,u,u,u] 2176; AVX512DQ-FCP-NEXT: vpor %ymm4, %ymm5, %ymm5 2177; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,0,3,5,0] 2178; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm4 2179; AVX512DQ-FCP-NEXT: vpermd %ymm4, %ymm6, %ymm6 2180; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} ymm7 = [0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27] 2181; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm6, %ymm6 2182; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0,1,2,3,4],ymm6[5,6,7],ymm5[8,9,10,11,12],ymm6[13,14,15] 2183; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7] 2184; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] 2185; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [2,0,0,0,4,7,1,6] 2186; AVX512DQ-FCP-NEXT: vpermd %ymm6, %ymm8, %ymm6 2187; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[0,1,18,19,20,21,26,27,16,17,30,31],zero,zero,zero,zero,zero,zero 2188; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10],ymm2[11],ymm3[12,13],ymm2[14],ymm3[15] 2189; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9 2190; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3],xmm8[4,5,6],xmm9[7] 2191; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[2,3,12,13,6,7,0,1,10,11,4,5,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[u,u,u,u,u,u] 2192; AVX512DQ-FCP-NEXT: vpor %ymm6, %ymm8, %ymm6 2193; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,0,0,0,1,3,6,0] 2194; AVX512DQ-FCP-NEXT: vpermd %ymm4, %ymm8, %ymm9 2195; AVX512DQ-FCP-NEXT: vpbroadcastq {{.*#+}} ymm8 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25] 2196; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm9, %ymm9 2197; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm6[0,1,2,3,4],ymm9[5,6,7],ymm6[8,9,10,11,12],ymm9[13,14,15] 2198; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm9[4,5,6,7] 2199; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm3[0,1],ymm2[2],ymm3[3],ymm2[4],ymm3[5,6],ymm2[7],ymm3[8,9],ymm2[10],ymm3[11],ymm2[12],ymm3[13,14],ymm2[15] 2200; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm9, %xmm10 2201; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm10[3,4],xmm9[5,6,7] 2202; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u] 2203; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] 2204; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,2,0,0,5,7,2,4] 2205; AVX512DQ-FCP-NEXT: vpermd %ymm10, %ymm11, %ymm10 2206; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23] 2207; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7] 2208; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [1,4,6,0,1,4,6,0] 2209; AVX512DQ-FCP-NEXT: # ymm10 = mem[0,1,0,1] 2210; AVX512DQ-FCP-NEXT: vpermd %ymm4, %ymm10, %ymm10 2211; AVX512DQ-FCP-NEXT: vpshufb %ymm7, %ymm10, %ymm7 2212; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm7 = 
ymm9[0,1,2,3,4],ymm7[5,6,7],ymm9[8,9,10,11,12],ymm7[13,14,15] 2213; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7] 2214; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0],ymm3[1,2],ymm2[3],ymm3[4],ymm2[5],ymm3[6,7],ymm2[8],ymm3[9,10],ymm2[11],ymm3[12],ymm2[13],ymm3[14,15] 2215; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm9, %xmm10 2216; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2],xmm9[3] 2217; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u] 2218; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] 2219; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,3,0,0,5,0,2,7] 2220; AVX512DQ-FCP-NEXT: vpermd %ymm10, %ymm11, %ymm10 2221; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,2,3,4,5,18,19,20,21,26,27,16,17,30,31,30,31,18,19,20,21] 2222; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7] 2223; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [2,4,7,0,2,4,7,0] 2224; AVX512DQ-FCP-NEXT: # ymm10 = mem[0,1,0,1] 2225; AVX512DQ-FCP-NEXT: vpermd %ymm4, %ymm10, %ymm10 2226; AVX512DQ-FCP-NEXT: vpshufb %ymm8, %ymm10, %ymm8 2227; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5,6,7],ymm9[8,9,10,11,12],ymm8[13,14,15] 2228; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] 2229; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13],ymm2[14],ymm3[15] 2230; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3 2231; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7] 2232; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[8,9,2,3,12,13,6,7,0,1,10,11,u,u,u,u] 2233; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15] 2234; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [1,3,0,0,6,0,3,5] 2235; AVX512DQ-FCP-NEXT: vpermd %ymm0, %ymm1, %ymm0 2236; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,u,u,u,u,u,u,u,u] 2237; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] 2238; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,0,0,0,0,2,5,7] 2239; AVX512DQ-FCP-NEXT: vpermd %ymm4, %ymm1, %ymm1 2240; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,u,u,u,u,24,25,30,31,u,u,u,u] 2241; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,1,2,3,4,5,12,14] 2242; AVX512DQ-FCP-NEXT: vpermi2d %ymm1, %ymm0, %ymm2 2243; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, (%rsi) 2244; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, (%rdx) 2245; AVX512DQ-FCP-NEXT: vmovdqa %ymm7, (%rcx) 2246; AVX512DQ-FCP-NEXT: vmovdqa %ymm8, (%r8) 2247; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, (%r9) 2248; AVX512DQ-FCP-NEXT: vzeroupper 2249; AVX512DQ-FCP-NEXT: retq 2250; 2251; AVX512BW-LABEL: load_i16_stride5_vf16: 2252; AVX512BW: # %bb.0: 2253; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 2254; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 2255; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,5,10,15,20,25,30,35,40,45,50,55,60,0,0,0] 2256; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 2257; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] 2258; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm4 2259; AVX512BW-NEXT: vpermi2w %ymm4, %ymm2, %ymm3 2260; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = 
[1,6,11,16,21,26,31,36,41,46,51,56,61,0,0,0] 2261; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 2262; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] 2263; AVX512BW-NEXT: vpermi2w %ymm4, %ymm2, %ymm5 2264; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [34,39,44,49,54,59,0,5,10,15,20,25,30,0,0,0] 2265; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm2 2266; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] 2267; AVX512BW-NEXT: vpermi2w %ymm4, %ymm2, %ymm6 2268; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [35,40,45,50,55,60,1,6,11,16,21,26,31,0,0,0] 2269; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm2 2270; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm7 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] 2271; AVX512BW-NEXT: vpermi2w %ymm4, %ymm2, %ymm7 2272; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [4,9,14,19,24,29,34,39,44,49,54,59,0,0,0,0] 2273; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 2274; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] 2275; AVX512BW-NEXT: vpermi2w %ymm4, %ymm2, %ymm0 2276; AVX512BW-NEXT: vmovdqa %ymm3, (%rsi) 2277; AVX512BW-NEXT: vmovdqa %ymm5, (%rdx) 2278; AVX512BW-NEXT: vmovdqa %ymm6, (%rcx) 2279; AVX512BW-NEXT: vmovdqa %ymm7, (%r8) 2280; AVX512BW-NEXT: vmovdqa %ymm0, (%r9) 2281; AVX512BW-NEXT: vzeroupper 2282; AVX512BW-NEXT: retq 2283; 2284; AVX512BW-FCP-LABEL: load_i16_stride5_vf16: 2285; AVX512BW-FCP: # %bb.0: 2286; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 2287; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 2288; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,5,10,15,20,25,30,35,40,45,50,55,60,0,0,0] 2289; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 2290; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] 2291; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm4 2292; AVX512BW-FCP-NEXT: vpermi2w %ymm4, %ymm2, %ymm3 2293; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [1,6,11,16,21,26,31,36,41,46,51,56,61,0,0,0] 2294; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 2295; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] 2296; AVX512BW-FCP-NEXT: vpermi2w %ymm4, %ymm2, %ymm5 2297; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [34,39,44,49,54,59,0,5,10,15,20,25,30,0,0,0] 2298; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm2 2299; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] 2300; AVX512BW-FCP-NEXT: vpermi2w %ymm4, %ymm2, %ymm6 2301; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [35,40,45,50,55,60,1,6,11,16,21,26,31,0,0,0] 2302; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm2 2303; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] 2304; AVX512BW-FCP-NEXT: vpermi2w %ymm4, %ymm2, %ymm7 2305; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [4,9,14,19,24,29,34,39,44,49,54,59,0,0,0,0] 2306; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 2307; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] 2308; AVX512BW-FCP-NEXT: vpermi2w %ymm4, %ymm2, %ymm0 2309; AVX512BW-FCP-NEXT: vmovdqa %ymm3, (%rsi) 2310; AVX512BW-FCP-NEXT: vmovdqa %ymm5, (%rdx) 2311; AVX512BW-FCP-NEXT: vmovdqa %ymm6, (%rcx) 2312; AVX512BW-FCP-NEXT: vmovdqa %ymm7, (%r8) 2313; AVX512BW-FCP-NEXT: vmovdqa %ymm0, (%r9) 2314; AVX512BW-FCP-NEXT: vzeroupper 2315; AVX512BW-FCP-NEXT: retq 2316; 2317; AVX512DQ-BW-LABEL: load_i16_stride5_vf16: 2318; AVX512DQ-BW: # %bb.0: 2319; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0 2320; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1 2321; AVX512DQ-BW-NEXT: 
vpmovsxbw {{.*#+}} ymm2 = [0,5,10,15,20,25,30,35,40,45,50,55,60,0,0,0] 2322; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 2323; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] 2324; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %ymm4 2325; AVX512DQ-BW-NEXT: vpermi2w %ymm4, %ymm2, %ymm3 2326; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [1,6,11,16,21,26,31,36,41,46,51,56,61,0,0,0] 2327; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 2328; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] 2329; AVX512DQ-BW-NEXT: vpermi2w %ymm4, %ymm2, %ymm5 2330; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [34,39,44,49,54,59,0,5,10,15,20,25,30,0,0,0] 2331; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm2 2332; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] 2333; AVX512DQ-BW-NEXT: vpermi2w %ymm4, %ymm2, %ymm6 2334; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [35,40,45,50,55,60,1,6,11,16,21,26,31,0,0,0] 2335; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm2 2336; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm7 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] 2337; AVX512DQ-BW-NEXT: vpermi2w %ymm4, %ymm2, %ymm7 2338; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm2 = [4,9,14,19,24,29,34,39,44,49,54,59,0,0,0,0] 2339; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 2340; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] 2341; AVX512DQ-BW-NEXT: vpermi2w %ymm4, %ymm2, %ymm0 2342; AVX512DQ-BW-NEXT: vmovdqa %ymm3, (%rsi) 2343; AVX512DQ-BW-NEXT: vmovdqa %ymm5, (%rdx) 2344; AVX512DQ-BW-NEXT: vmovdqa %ymm6, (%rcx) 2345; AVX512DQ-BW-NEXT: vmovdqa %ymm7, (%r8) 2346; AVX512DQ-BW-NEXT: vmovdqa %ymm0, (%r9) 2347; AVX512DQ-BW-NEXT: vzeroupper 2348; AVX512DQ-BW-NEXT: retq 2349; 2350; AVX512DQ-BW-FCP-LABEL: load_i16_stride5_vf16: 2351; AVX512DQ-BW-FCP: # %bb.0: 2352; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 2353; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 2354; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [0,5,10,15,20,25,30,35,40,45,50,55,60,0,0,0] 2355; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 2356; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27] 2357; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm4 2358; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm4, %ymm2, %ymm3 2359; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [1,6,11,16,21,26,31,36,41,46,51,56,61,0,0,0] 2360; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 2361; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] 2362; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm4, %ymm2, %ymm5 2363; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [34,39,44,49,54,59,0,5,10,15,20,25,30,0,0,0] 2364; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm2 2365; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] 2366; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm4, %ymm2, %ymm6 2367; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [35,40,45,50,55,60,1,6,11,16,21,26,31,0,0,0] 2368; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm2 2369; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30] 2370; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm4, %ymm2, %ymm7 2371; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm2 = [4,9,14,19,24,29,34,39,44,49,54,59,0,0,0,0] 2372; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 2373; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31] 2374; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm4, %ymm2, %ymm0 
2375; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm3, (%rsi) 2376; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm5, (%rdx) 2377; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm6, (%rcx) 2378; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm7, (%r8) 2379; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm0, (%r9) 2380; AVX512DQ-BW-FCP-NEXT: vzeroupper 2381; AVX512DQ-BW-FCP-NEXT: retq 2382 %wide.vec = load <80 x i16>, ptr %in.vec, align 64 2383 %strided.vec0 = shufflevector <80 x i16> %wide.vec, <80 x i16> poison, <16 x i32> <i32 0, i32 5, i32 10, i32 15, i32 20, i32 25, i32 30, i32 35, i32 40, i32 45, i32 50, i32 55, i32 60, i32 65, i32 70, i32 75> 2384 %strided.vec1 = shufflevector <80 x i16> %wide.vec, <80 x i16> poison, <16 x i32> <i32 1, i32 6, i32 11, i32 16, i32 21, i32 26, i32 31, i32 36, i32 41, i32 46, i32 51, i32 56, i32 61, i32 66, i32 71, i32 76> 2385 %strided.vec2 = shufflevector <80 x i16> %wide.vec, <80 x i16> poison, <16 x i32> <i32 2, i32 7, i32 12, i32 17, i32 22, i32 27, i32 32, i32 37, i32 42, i32 47, i32 52, i32 57, i32 62, i32 67, i32 72, i32 77> 2386 %strided.vec3 = shufflevector <80 x i16> %wide.vec, <80 x i16> poison, <16 x i32> <i32 3, i32 8, i32 13, i32 18, i32 23, i32 28, i32 33, i32 38, i32 43, i32 48, i32 53, i32 58, i32 63, i32 68, i32 73, i32 78> 2387 %strided.vec4 = shufflevector <80 x i16> %wide.vec, <80 x i16> poison, <16 x i32> <i32 4, i32 9, i32 14, i32 19, i32 24, i32 29, i32 34, i32 39, i32 44, i32 49, i32 54, i32 59, i32 64, i32 69, i32 74, i32 79> 2388 store <16 x i16> %strided.vec0, ptr %out.vec0, align 64 2389 store <16 x i16> %strided.vec1, ptr %out.vec1, align 64 2390 store <16 x i16> %strided.vec2, ptr %out.vec2, align 64 2391 store <16 x i16> %strided.vec3, ptr %out.vec3, align 64 2392 store <16 x i16> %strided.vec4, ptr %out.vec4, align 64 2393 ret void 2394} 2395 2396define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4) nounwind { 2397; SSE-LABEL: load_i16_stride5_vf32: 2398; SSE: # %bb.0: 2399; SSE-NEXT: subq $408, %rsp # imm = 0x198 2400; SSE-NEXT: movdqa 64(%rdi), %xmm4 2401; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2402; SSE-NEXT: movdqa (%rdi), %xmm6 2403; SSE-NEXT: movdqa 16(%rdi), %xmm13 2404; SSE-NEXT: movdqa 32(%rdi), %xmm9 2405; SSE-NEXT: movdqa 48(%rdi), %xmm5 2406; SSE-NEXT: movdqa 224(%rdi), %xmm7 2407; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2408; SSE-NEXT: movdqa 160(%rdi), %xmm11 2409; SSE-NEXT: movdqa 176(%rdi), %xmm12 2410; SSE-NEXT: movdqa 208(%rdi), %xmm8 2411; SSE-NEXT: movdqa 192(%rdi), %xmm2 2412; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2413; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,0,65535,65535,65535] 2414; SSE-NEXT: movdqa %xmm0, %xmm1 2415; SSE-NEXT: pandn %xmm2, %xmm1 2416; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[0,1,0,3] 2417; SSE-NEXT: pand %xmm0, %xmm2 2418; SSE-NEXT: por %xmm1, %xmm2 2419; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[3,1,2,3] 2420; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2421; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] 2422; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[0,2,2,3] 2423; SSE-NEXT: movdqa %xmm11, (%rsp) # 16-byte Spill 2424; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] 2425; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] 2426; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3] 2427; SSE-NEXT: movaps {{.*#+}} xmm15 = [65535,65535,65535,65535,65535,65535,65535,0] 2428; SSE-NEXT: andps %xmm15, %xmm3 2429; SSE-NEXT: pshufd {{.*#+}} xmm1 
= xmm7[0,1,0,1] 2430; SSE-NEXT: movaps %xmm15, %xmm2 2431; SSE-NEXT: pandn %xmm1, %xmm2 2432; SSE-NEXT: por %xmm3, %xmm2 2433; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2434; SSE-NEXT: movdqa %xmm0, %xmm1 2435; SSE-NEXT: pandn %xmm9, %xmm1 2436; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2437; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[0,1,0,3] 2438; SSE-NEXT: movdqa %xmm5, %xmm7 2439; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2440; SSE-NEXT: pand %xmm0, %xmm2 2441; SSE-NEXT: por %xmm1, %xmm2 2442; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[3,1,2,3] 2443; SSE-NEXT: movdqa %xmm13, %xmm5 2444; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2445; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] 2446; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm6[0,2,2,3] 2447; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2448; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] 2449; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] 2450; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3] 2451; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,1,0,1] 2452; SSE-NEXT: movaps %xmm15, %xmm2 2453; SSE-NEXT: andnps %xmm1, %xmm2 2454; SSE-NEXT: movdqa 272(%rdi), %xmm4 2455; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2456; SSE-NEXT: andps %xmm15, %xmm3 2457; SSE-NEXT: orps %xmm3, %xmm2 2458; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2459; SSE-NEXT: movdqa %xmm0, %xmm1 2460; SSE-NEXT: pandn %xmm4, %xmm1 2461; SSE-NEXT: movdqa 288(%rdi), %xmm2 2462; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2463; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] 2464; SSE-NEXT: pand %xmm0, %xmm2 2465; SSE-NEXT: por %xmm1, %xmm2 2466; SSE-NEXT: movdqa 256(%rdi), %xmm14 2467; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[3,1,2,3] 2468; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2469; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] 2470; SSE-NEXT: movdqa 240(%rdi), %xmm13 2471; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm13[0,2,2,3] 2472; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2473; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] 2474; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] 2475; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3] 2476; SSE-NEXT: movdqa 304(%rdi), %xmm1 2477; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2478; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] 2479; SSE-NEXT: movaps %xmm15, %xmm2 2480; SSE-NEXT: andnps %xmm1, %xmm2 2481; SSE-NEXT: andps %xmm15, %xmm3 2482; SSE-NEXT: orps %xmm3, %xmm2 2483; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2484; SSE-NEXT: movdqa 128(%rdi), %xmm1 2485; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2486; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] 2487; SSE-NEXT: pand %xmm0, %xmm1 2488; SSE-NEXT: movdqa 112(%rdi), %xmm2 2489; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2490; SSE-NEXT: pandn %xmm2, %xmm0 2491; SSE-NEXT: por %xmm1, %xmm0 2492; SSE-NEXT: movdqa 96(%rdi), %xmm1 2493; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2494; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] 2495; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] 2496; SSE-NEXT: movdqa 80(%rdi), %xmm4 2497; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,2,2,3] 2498; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2499; SSE-NEXT: pshuflw {{.*#+}} 
xmm2 = xmm2[0,3,2,3,4,5,6,7] 2500; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 2501; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3] 2502; SSE-NEXT: movdqa 144(%rdi), %xmm0 2503; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2504; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 2505; SSE-NEXT: movaps %xmm15, %xmm1 2506; SSE-NEXT: andnps %xmm0, %xmm1 2507; SSE-NEXT: andps %xmm15, %xmm2 2508; SSE-NEXT: orps %xmm2, %xmm1 2509; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2510; SSE-NEXT: psrlq $48, %xmm12 2511; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[0,3,2,3] 2512; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,2,3,4,5,6,7] 2513; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1] 2514; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,0,0,65535,65535,65535,65535,65535] 2515; SSE-NEXT: movdqa %xmm0, %xmm2 2516; SSE-NEXT: pandn %xmm1, %xmm2 2517; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2518; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,3,2,3] 2519; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload 2520; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[0,2,2,3] 2521; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] 2522; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,7,5,6,7] 2523; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] 2524; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] 2525; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,4,7] 2526; SSE-NEXT: pand %xmm0, %xmm1 2527; SSE-NEXT: por %xmm2, %xmm1 2528; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload 2529; SSE-NEXT: movdqa %xmm10, %xmm2 2530; SSE-NEXT: psllq $48, %xmm2 2531; SSE-NEXT: movaps %xmm15, %xmm3 2532; SSE-NEXT: andnps %xmm2, %xmm3 2533; SSE-NEXT: pand %xmm15, %xmm1 2534; SSE-NEXT: orps %xmm1, %xmm3 2535; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2536; SSE-NEXT: psrlq $48, %xmm5 2537; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,3,2,3] 2538; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,2,3,4,5,6,7] 2539; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] 2540; SSE-NEXT: movdqa %xmm0, %xmm1 2541; SSE-NEXT: pandn %xmm2, %xmm1 2542; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[1,3,2,3] 2543; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[0,2,2,3] 2544; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] 2545; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,7,5,6,7] 2546; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] 2547; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7] 2548; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,6,4,7] 2549; SSE-NEXT: pand %xmm0, %xmm2 2550; SSE-NEXT: por %xmm1, %xmm2 2551; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 2552; SSE-NEXT: movdqa %xmm6, %xmm1 2553; SSE-NEXT: psllq $48, %xmm1 2554; SSE-NEXT: movdqa %xmm15, %xmm3 2555; SSE-NEXT: pandn %xmm1, %xmm3 2556; SSE-NEXT: pand %xmm15, %xmm2 2557; SSE-NEXT: por %xmm2, %xmm3 2558; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2559; SSE-NEXT: movdqa %xmm14, %xmm1 2560; SSE-NEXT: psrlq $48, %xmm1 2561; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[0,3,2,3] 2562; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,2,3,4,5,6,7] 2563; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 2564; SSE-NEXT: movdqa %xmm0, %xmm1 2565; SSE-NEXT: pandn %xmm2, %xmm1 2566; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 2567; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[1,3,2,3] 2568; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 
16-byte Folded Reload 2569; SSE-NEXT: # xmm3 = mem[0,2,2,3] 2570; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] 2571; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,7,5,6,7] 2572; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] 2573; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7] 2574; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,6,4,7] 2575; SSE-NEXT: pand %xmm0, %xmm2 2576; SSE-NEXT: por %xmm1, %xmm2 2577; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload 2578; SSE-NEXT: movdqa %xmm7, %xmm1 2579; SSE-NEXT: psllq $48, %xmm1 2580; SSE-NEXT: movdqa %xmm15, %xmm3 2581; SSE-NEXT: pandn %xmm1, %xmm3 2582; SSE-NEXT: pand %xmm15, %xmm2 2583; SSE-NEXT: por %xmm2, %xmm3 2584; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2585; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload 2586; SSE-NEXT: movdqa %xmm13, %xmm1 2587; SSE-NEXT: psrlq $48, %xmm1 2588; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,3,2,3] 2589; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,2,3,4,5,6,7] 2590; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] 2591; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload 2592; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[1,3,2,3] 2593; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload 2594; SSE-NEXT: # xmm3 = mem[0,2,2,3] 2595; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] 2596; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,7,5,6,7] 2597; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] 2598; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] 2599; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,4,7] 2600; SSE-NEXT: pand %xmm0, %xmm1 2601; SSE-NEXT: pandn %xmm2, %xmm0 2602; SSE-NEXT: por %xmm1, %xmm0 2603; SSE-NEXT: pand %xmm15, %xmm0 2604; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 2605; SSE-NEXT: movdqa %xmm5, %xmm1 2606; SSE-NEXT: psllq $48, %xmm1 2607; SSE-NEXT: pandn %xmm1, %xmm15 2608; SSE-NEXT: por %xmm0, %xmm15 2609; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2610; SSE-NEXT: movdqa %xmm8, %xmm0 2611; SSE-NEXT: movdqa %xmm11, %xmm8 2612; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm11[0,0] 2613; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm11[2,3] 2614; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,1,3] 2615; SSE-NEXT: movaps {{.*#+}} xmm11 = [65535,65535,65535,0,0,0,65535,65535] 2616; SSE-NEXT: movaps %xmm11, %xmm1 2617; SSE-NEXT: andnps %xmm0, %xmm1 2618; SSE-NEXT: movdqa (%rsp), %xmm4 # 16-byte Reload 2619; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,1,1,3] 2620; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] 2621; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 2622; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] 2623; SSE-NEXT: pand %xmm11, %xmm2 2624; SSE-NEXT: por %xmm1, %xmm2 2625; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] 2626; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[0,1,2,0] 2627; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5] 2628; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] 2629; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] 2630; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2631; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 2632; SSE-NEXT: movaps %xmm1, %xmm0 2633; SSE-NEXT: movaps %xmm1, %xmm15 2634; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload 2635; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm9[0,0] 2636; SSE-NEXT: shufps 
{{.*#+}} xmm0 = xmm0[2,0],xmm9[2,3] 2637; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,1,3] 2638; SSE-NEXT: movaps %xmm11, %xmm1 2639; SSE-NEXT: andnps %xmm0, %xmm1 2640; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 2641; SSE-NEXT: # xmm2 = mem[0,1,1,3] 2642; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] 2643; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 2644; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] 2645; SSE-NEXT: pand %xmm11, %xmm2 2646; SSE-NEXT: por %xmm1, %xmm2 2647; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] 2648; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,1,2,0] 2649; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5] 2650; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] 2651; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] 2652; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2653; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload 2654; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,0],xmm10[0,0] 2655; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0],xmm10[2,3] 2656; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,0,1,3] 2657; SSE-NEXT: movaps %xmm11, %xmm1 2658; SSE-NEXT: andnps %xmm14, %xmm1 2659; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 2660; SSE-NEXT: # xmm2 = mem[0,1,1,3] 2661; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] 2662; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 2663; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] 2664; SSE-NEXT: pand %xmm11, %xmm2 2665; SSE-NEXT: por %xmm1, %xmm2 2666; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm14[0,1,2,3,6,5,6,7] 2667; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,1,2,0] 2668; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5] 2669; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] 2670; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] 2671; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2672; SSE-NEXT: movdqa %xmm12, %xmm0 2673; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 2674; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] 2675; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] 2676; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,1,3] 2677; SSE-NEXT: movaps %xmm11, %xmm1 2678; SSE-NEXT: andnps %xmm0, %xmm1 2679; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 2680; SSE-NEXT: # xmm2 = mem[0,1,1,3] 2681; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] 2682; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm13[2],xmm2[3],xmm13[3] 2683; SSE-NEXT: pand %xmm11, %xmm2 2684; SSE-NEXT: por %xmm1, %xmm2 2685; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] 2686; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,1,2,0] 2687; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5] 2688; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] 2689; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] 2690; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2691; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[2,2,2,2,4,5,6,7] 2692; SSE-NEXT: movdqa %xmm11, %xmm1 2693; SSE-NEXT: pandn %xmm0, %xmm1 2694; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1] 2695; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] 2696; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] 2697; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,3,2,3,4,5,6,7] 2698; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 2699; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,3,4,5,6,7] 2700; 
SSE-NEXT: pand %xmm11, %xmm0 2701; SSE-NEXT: por %xmm1, %xmm0 2702; SSE-NEXT: movdqa %xmm0, %xmm2 2703; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 2704; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm8[3,0] 2705; SSE-NEXT: movaps %xmm11, %xmm0 2706; SSE-NEXT: andnps %xmm8, %xmm0 2707; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2708; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm3[0,2] 2709; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,7,4,6,7] 2710; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 2711; SSE-NEXT: # xmm1 = mem[0,1,0,3] 2712; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,6] 2713; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] 2714; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,0] 2715; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2716; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[2,2,2,2,4,5,6,7] 2717; SSE-NEXT: movdqa %xmm11, %xmm1 2718; SSE-NEXT: pandn %xmm0, %xmm1 2719; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload 2720; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] 2721; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 2722; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[0,2,2,3] 2723; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] 2724; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,3,2,3,4,5,6,7] 2725; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 2726; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm0[1,0,3,3,4,5,6,7] 2727; SSE-NEXT: pand %xmm11, %xmm13 2728; SSE-NEXT: por %xmm1, %xmm13 2729; SSE-NEXT: movaps %xmm15, %xmm0 2730; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm9[3,0] 2731; SSE-NEXT: movdqa %xmm11, %xmm12 2732; SSE-NEXT: pandn %xmm9, %xmm12 2733; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm0[0,2] 2734; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm9[0,1,2,3,7,4,6,7] 2735; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 2736; SSE-NEXT: # xmm1 = mem[0,1,0,3] 2737; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,6] 2738; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] 2739; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm1[2,0] 2740; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[2,2,2,2,4,5,6,7] 2741; SSE-NEXT: movdqa %xmm11, %xmm1 2742; SSE-NEXT: pandn %xmm0, %xmm1 2743; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 2744; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1] 2745; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 2746; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3] 2747; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] 2748; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,3,2,3,4,5,6,7] 2749; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 2750; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm0[1,0,3,3,4,5,6,7] 2751; SSE-NEXT: pand %xmm11, %xmm9 2752; SSE-NEXT: por %xmm1, %xmm9 2753; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2754; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm10[3,0] 2755; SSE-NEXT: movdqa %xmm11, %xmm15 2756; SSE-NEXT: pandn %xmm10, %xmm15 2757; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm0[0,2] 2758; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,7,4,6,7] 2759; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[0,1,0,3] 2760; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,6] 2761; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] 2762; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm1[2,0] 2763; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload 2764; SSE-NEXT: pshuflw {{.*#+}} xmm0 = 
xmm7[2,2,2,2,4,5,6,7] 2765; SSE-NEXT: movdqa %xmm11, %xmm1 2766; SSE-NEXT: pandn %xmm0, %xmm1 2767; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 2768; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1] 2769; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 2770; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,2,2,3] 2771; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] 2772; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,3,2,3,4,5,6,7] 2773; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 2774; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm0[1,0,3,3,4,5,6,7] 2775; SSE-NEXT: pand %xmm11, %xmm10 2776; SSE-NEXT: por %xmm1, %xmm10 2777; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2778; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload 2779; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0] 2780; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2] 2781; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm14[3,0] 2782; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm8[0,2] 2783; SSE-NEXT: movaps %xmm14, %xmm2 2784; SSE-NEXT: movdqa %xmm3, %xmm1 2785; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm3[3,0] 2786; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm6[0,2] 2787; SSE-NEXT: movaps %xmm1, %xmm14 2788; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm4[3,0] 2789; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm5[0,2] 2790; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload 2791; SSE-NEXT: movaps %xmm8, %xmm1 2792; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm7[3,0] 2793; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] 2794; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 2795; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[2,1,3,3,4,5,6,7] 2796; SSE-NEXT: pand %xmm11, %xmm3 2797; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,6,6,7] 2798; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 2799; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[2,1,3,3,4,5,6,7] 2800; SSE-NEXT: pand %xmm11, %xmm2 2801; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm14[0,1,2,3,4,6,6,7] 2802; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] 2803; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,3,3,4,5,6,7] 2804; SSE-NEXT: pand %xmm11, %xmm0 2805; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7] 2806; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] 2807; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,1,3,3,4,5,6,7] 2808; SSE-NEXT: pand %xmm11, %xmm4 2809; SSE-NEXT: pandn %xmm7, %xmm11 2810; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[0,2] 2811; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm7[0,1,2,3,7,4,6,7] 2812; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 2813; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm6[0,1,0,3] 2814; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,5,6] 2815; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[3,1],xmm1[2,3] 2816; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm14[2,0] 2817; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload 2818; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 2819; SSE-NEXT: # xmm1 = mem[0,2,2,3] 2820; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload 2821; SSE-NEXT: # xmm14 = mem[0,1,1,3] 2822; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] 2823; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,4,7] 2824; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[3,1],xmm1[1,3] 2825; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm14[2,0] 2826; SSE-NEXT: por %xmm12, %xmm2 2827; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 
2828; SSE-NEXT: # xmm1 = mem[0,2,2,3] 2829; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload 2830; SSE-NEXT: # xmm12 = mem[0,1,1,3] 2831; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] 2832; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,4,7] 2833; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[3,1],xmm1[1,3] 2834; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm12[2,0] 2835; SSE-NEXT: por %xmm15, %xmm0 2836; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 2837; SSE-NEXT: # xmm1 = mem[0,2,2,3] 2838; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload 2839; SSE-NEXT: # xmm5 = mem[0,1,1,3] 2840; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] 2841; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,7] 2842; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,1],xmm1[1,3] 2843; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,0] 2844; SSE-NEXT: por %xmm4, %xmm11 2845; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,2,2,3] 2846; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,1,1,3] 2847; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] 2848; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7] 2849; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm1[1,3] 2850; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm4[2,0] 2851; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 2852; SSE-NEXT: movaps %xmm1, 16(%rsi) 2853; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 2854; SSE-NEXT: movaps %xmm1, 48(%rsi) 2855; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 2856; SSE-NEXT: movaps %xmm1, (%rsi) 2857; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 2858; SSE-NEXT: movaps %xmm1, 32(%rsi) 2859; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 2860; SSE-NEXT: movaps %xmm1, 16(%rdx) 2861; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 2862; SSE-NEXT: movaps %xmm1, 48(%rdx) 2863; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 2864; SSE-NEXT: movaps %xmm1, (%rdx) 2865; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 2866; SSE-NEXT: movaps %xmm1, 32(%rdx) 2867; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 2868; SSE-NEXT: movaps %xmm1, 16(%rcx) 2869; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 2870; SSE-NEXT: movaps %xmm1, 48(%rcx) 2871; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 2872; SSE-NEXT: movaps %xmm1, (%rcx) 2873; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 2874; SSE-NEXT: movaps %xmm1, 32(%rcx) 2875; SSE-NEXT: movaps %xmm10, 16(%r8) 2876; SSE-NEXT: movaps %xmm9, 48(%r8) 2877; SSE-NEXT: movaps %xmm13, (%r8) 2878; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 2879; SSE-NEXT: movaps %xmm1, 32(%r8) 2880; SSE-NEXT: movaps %xmm11, 16(%r9) 2881; SSE-NEXT: movaps %xmm0, 48(%r9) 2882; SSE-NEXT: movaps %xmm2, (%r9) 2883; SSE-NEXT: movaps %xmm3, 32(%r9) 2884; SSE-NEXT: addq $408, %rsp # imm = 0x198 2885; SSE-NEXT: retq 2886; 2887; AVX-LABEL: load_i16_stride5_vf32: 2888; AVX: # %bb.0: 2889; AVX-NEXT: subq $424, %rsp # imm = 0x1A8 2890; AVX-NEXT: vmovdqa 144(%rdi), %xmm9 2891; AVX-NEXT: vmovdqa 128(%rdi), %xmm7 2892; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0,1],xmm9[2,3],xmm7[4,5,6,7] 2893; AVX-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2894; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7] 2895; AVX-NEXT: vpshufb %xmm1, %xmm0, %xmm0 2896; 
; AVX-NEXT: vmovdqa 96(%rdi), %xmm11
; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm11[0,1,1,3]
; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
; AVX-NEXT: vmovdqa 112(%rdi), %xmm10
; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm10[1]
; AVX-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovdqa 80(%rdi), %xmm3
; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3,4,5,6,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm0[5,6,7]
; AVX-NEXT: vmovdqa (%rdi), %xmm5
; AVX-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovdqa 16(%rdi), %xmm12
; AVX-NEXT: vmovdqa 32(%rdi), %xmm3
; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovdqa 48(%rdi), %xmm15
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm15[0,1,0,3]
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4],xmm0[5,6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm12[3,1,2,3]
; AVX-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[0,2,2,3]
; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7]
; AVX-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm0[4,5,6,7]
; AVX-NEXT: vmovaps {{.*#+}} ymm6 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,65535,65535,65535]
; AVX-NEXT: vandps %ymm6, %ymm3, %ymm3
; AVX-NEXT: vmovaps 64(%rdi), %xmm5
; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm5[0,1,0,1]
; AVX-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vandnps %ymm4, %ymm6, %ymm4
; AVX-NEXT: vorps %ymm4, %ymm3, %ymm3
; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm0
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovdqa 304(%rdi), %xmm2
; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovdqa 288(%rdi), %xmm13
; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm13[0,1],xmm2[2,3],xmm13[4,5,6,7]
; AVX-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpshufb %xmm1, %xmm2, %xmm1
; AVX-NEXT: vmovdqa 256(%rdi), %xmm0
; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,1,3]
; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7]
; AVX-NEXT: vmovdqa 272(%rdi), %xmm0
; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; AVX-NEXT: vmovdqa 240(%rdi), %xmm0
; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,2,2,3]
; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3,4,5,6,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0,1,2,3,4],xmm1[5,6,7]
; AVX-NEXT: vmovdqa 176(%rdi), %xmm0
; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
; AVX-NEXT: vmovdqa 160(%rdi), %xmm0
; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7]
; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX-NEXT: vmovdqa 208(%rdi), %xmm0
; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3]
; AVX-NEXT: vmovdqa 192(%rdi), %xmm14
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm14[4],xmm0[5,6,7]
; AVX-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; AVX-NEXT: vandps %ymm6, %ymm0, %ymm0
; AVX-NEXT: vmovaps 224(%rdi), %xmm1
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vshufps {{.*#+}} xmm8 = xmm1[0,1,0,1]
; AVX-NEXT: vandnps %ymm8, %ymm6, %ymm8
; AVX-NEXT: vorps %ymm0, %ymm8, %ymm0
; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovdqa %xmm11, %xmm6
; AVX-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm10[0,1],xmm11[2,3],xmm10[4,5,6,7]
; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,0,4,5,6,7]
; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5]
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[0,3,2,3]
; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3,4,5,6,7]
; AVX-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0,1,2,3],xmm9[4,5],xmm7[6,7]
; AVX-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9]
; AVX-NEXT: vpshufb %xmm8, %xmm3, %xmm3
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5,6,7]
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[0,3,2,3]
; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7]
; AVX-NEXT: vpsrlq $48, %xmm12, %xmm9
; AVX-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1]
; AVX-NEXT: vmovdqa %xmm15, %xmm12
; AVX-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm2[0,1],xmm15[2,3],xmm2[4,5],xmm15[6,7]
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,0,1,0,1,0,1,10,11,4,5,14,15,6,7]
; AVX-NEXT: vpshufb %xmm1, %xmm9, %xmm9
; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm9[3,4,5,6,7]
; AVX-NEXT: vmovaps {{.*#+}} ymm10 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,65535,65535,65535]
; AVX-NEXT: vandps %ymm3, %ymm10, %ymm3
; AVX-NEXT: vpsllq $48, %xmm5, %xmm9
; AVX-NEXT: vandnps %ymm9, %ymm10, %ymm9
; AVX-NEXT: vorps %ymm3, %ymm9, %ymm3
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1,2,3],xmm5[4,5],xmm13[6,7]
; AVX-NEXT: vpshufb %xmm8, %xmm0, %xmm0
; AVX-NEXT: vmovdqa (%rsp), %xmm15 # 16-byte Reload
; AVX-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm3 # 16-byte Folded Reload
; AVX-NEXT: # xmm3 = xmm15[0,1],mem[2,3],xmm15[4,5,6,7]
; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,0,4,5,6,7]
; AVX-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5]
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm10[0,3,2,3]
; AVX-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[1,2,2,3,4,5,6,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0,1],xmm3[2,3,4,5,6,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3,4],xmm0[5,6,7]
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm14[0,1],xmm13[2,3],xmm14[4,5],xmm13[6,7]
; AVX-NEXT: vpshufb %xmm1, %xmm3, %xmm1
; AVX-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
; AVX-NEXT: # xmm3 = mem[0,3,2,3]
; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7]
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
; AVX-NEXT: vpsrlq $48, %xmm8, %xmm8
; AVX-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1]
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3,4,5,6,7]
; AVX-NEXT: vmovaps {{.*#+}} ymm8 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,65535,65535,65535]
; AVX-NEXT: vandps %ymm1, %ymm8, %ymm1
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; AVX-NEXT: vpsllq $48, %xmm14, %xmm3
; AVX-NEXT: vandnps %ymm3, %ymm8, %ymm3
; AVX-NEXT: vorps %ymm3, %ymm1, %ymm1
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = mem[0,1,2,3],xmm6[4,5],mem[6,7]
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [u,u,u,u,8,9,2,3,12,13,12,13,12,13,12,13]
; AVX-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[3,1,2,3]
; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3,4,5,6,7]
; AVX-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm3 # 16-byte Folded Reload
; AVX-NEXT: # xmm3 = mem[0,1],xmm7[2,3],mem[4,5,6,7]
; AVX-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11]
; AVX-NEXT: vpshufb %xmm7, %xmm3, %xmm3
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5,6,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0,1],xmm12[2,3],xmm2[4,5,6,7]
; AVX-NEXT: vmovdqa {{.*#+}} xmm8 = [2,3,2,3,2,3,2,3,12,13,6,7,12,13,14,15]
; AVX-NEXT: vpshufb %xmm8, %xmm3, %xmm3
; AVX-NEXT: vpshufd {{.*#+}} xmm9 = xmm4[0,1,1,3]
; AVX-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,7,6,7]
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
; AVX-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm9[2],xmm11[2],xmm9[3],xmm11[3]
; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm9[0,1,2],xmm3[3,4,5],xmm9[6,7]
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
; AVX-NEXT: vpshufd {{.*#+}} xmm9 = xmm12[0,1,2,0]
; AVX-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,6,5]
; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm9[6,7]
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; AVX-NEXT: vmovdqa %xmm5, %xmm9
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0,1],xmm6[2,3],xmm5[4,5,6,7]
; AVX-NEXT: vpshufb %xmm7, %xmm0, %xmm4
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; AVX-NEXT: vmovdqa %xmm15, %xmm0
; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm15[0,1,2,3],xmm5[4,5],xmm15[6,7]
; AVX-NEXT: vpshufb %xmm1, %xmm3, %xmm1
; AVX-NEXT: vmovdqa %xmm10, %xmm2
; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm10[3,1,2,3]
; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3,4,5,6,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm10 = xmm1[0,1,2,3,4],xmm4[5,6,7]
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm15[0,1],xmm13[2,3],xmm15[4,5,6,7]
; AVX-NEXT: vpshufb %xmm8, %xmm1, %xmm1
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[0,1,1,3]
; AVX-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7]
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
; AVX-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm3[2],xmm8[2],xmm3[3],xmm8[3]
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3,4,5],xmm3[6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm14[0,1,2,0]
; AVX-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,6,5]
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm3[6,7]
; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm1, %ymm1
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm0[2,3],xmm5[4,5],xmm0[6,7]
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [u,u,0,1,10,11,4,5,14,15,14,15,14,15,14,15]
; AVX-NEXT: vpshufb %xmm1, %xmm3, %xmm0
; AVX-NEXT: vpsrlq $48, %xmm2, %xmm3
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1,2,3,4,5,6,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm9[0,1,2,3],xmm6[4,5],xmm9[6,7]
; AVX-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13]
; AVX-NEXT: vpshufb %xmm5, %xmm3, %xmm3
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5,6,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0,1],xmm4[2,3],xmm8[4,5,6,7]
; AVX-NEXT: vmovdqa {{.*#+}} xmm6 = [6,7,0,1,10,11,10,11,8,9,10,11,12,13,14,15]
; AVX-NEXT: vpshufb %xmm6, %xmm3, %xmm3
; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm15[0,1,2,3],xmm13[4,5],xmm15[6,7]
; AVX-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[2,2,2,2,4,5,6,7]
; AVX-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,7,4,6,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm9[3,4,5],xmm3[6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm9 = xmm14[0,1,0,3]
; AVX-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,5,6]
; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm9[6,7]
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm8[0,1,2,3],xmm9[4,5],xmm8[6,7]
; AVX-NEXT: vpshufb %xmm5, %xmm0, %xmm0
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm4[2,3],xmm5[4,5],xmm4[6,7]
; AVX-NEXT: vpshufb %xmm1, %xmm3, %xmm1
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX-NEXT: vpsrlq $48, %xmm2, %xmm3
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3,4,5,6,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm1[0,1,2,3,4],xmm0[5,6,7]
; AVX-NEXT: vmovdqa %xmm11, %xmm0
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm11[0,1],xmm10[2,3],xmm11[4,5,6,7]
; AVX-NEXT: vpshufb %xmm6, %xmm1, %xmm1
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1,2,3],xmm11[4,5],xmm6[6,7]
; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,2,2,2,4,5,6,7]
; AVX-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,4,6,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[3,4,5],xmm1[6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm12[0,1,0,3]
; AVX-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,6]
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm3[6,7]
; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm1, %ymm7
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm8[3,1,2,3]
; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,1,4,5,6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[0,2,2,3]
; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,0,3,4,5,6,7]
; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[0,3,2,3]
; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3],xmm3[4,5,6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,2,3]
; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1,2,3,4,5,6,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1,2,3],xmm10[4,5],xmm0[6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[1,1,1,1]
; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm11[0,2,2,3]
; AVX-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,0,3,4,5,6,7]
; AVX-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1]
; AVX-NEXT: vmovdqa {{.*#+}} xmm8 = [8,9,2,3,12,13,12,13,8,9,12,13,12,13,14,15]
; AVX-NEXT: vpshufb %xmm8, %xmm3, %xmm3
; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[3,4,5],xmm3[6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm12[0,1,1,3]
; AVX-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm5[6,7]
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
; AVX-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
; AVX-NEXT: # xmm3 = mem[3,1,2,3]
; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,2,1,4,5,6,7]
; AVX-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
; AVX-NEXT: # xmm5 = mem[0,2,2,3]
; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,0,3,4,5,6,7]
; AVX-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
; AVX-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
; AVX-NEXT: # xmm5 = mem[0,3,2,3]
; AVX-NEXT: vpblendw $8, (%rsp), %xmm5, %xmm5 # 16-byte Folded Reload
; AVX-NEXT: # xmm5 = xmm5[0,1,2],mem[3],xmm5[4,5,6,7]
; AVX-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
; AVX-NEXT: # xmm9 = mem[2,3,2,3]
; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm9[0],xmm5[1,2,3,4,5,6,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3],xmm3[4,5,6,7]
; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload
; AVX-NEXT: # xmm5 = xmm0[0,1,2,3],mem[4,5],xmm0[6,7]
; AVX-NEXT: vpshufb %xmm8, %xmm5, %xmm5
; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm15[1,1,1,1]
; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm13[0,2,2,3]
; AVX-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,0,3,4,5,6,7]
; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1]
; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2],xmm2[3,4,5],xmm5[6,7]
; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm14[0,1,1,3]
; AVX-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7]
; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm4[6,7]
; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm3, 32(%rsi)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm3, (%rsi)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm3, 32(%rdx)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm3, (%rdx)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm0, 32(%rcx)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm0, (%rcx)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm0, 32(%r8)
; AVX-NEXT: vmovaps %ymm7, (%r8)
; AVX-NEXT: vmovaps %ymm2, 32(%r9)
; AVX-NEXT: vmovaps %ymm1, (%r9)
; AVX-NEXT: addq $424, %rsp # imm = 0x1A8
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX2-LABEL: load_i16_stride5_vf32:
; AVX2: # %bb.0:
; AVX2-NEXT: subq $264, %rsp # imm = 0x108
; AVX2-NEXT: vmovdqa (%rdi), %ymm1
; AVX2-NEXT: vmovdqa 32(%rdi), %ymm2
; AVX2-NEXT: vmovdqa 64(%rdi), %ymm3
; AVX2-NEXT: vmovdqa 96(%rdi), %ymm15
; AVX2-NEXT: vmovdqa 192(%rdi), %ymm4
; AVX2-NEXT: vmovdqa 160(%rdi), %ymm5
; AVX2-NEXT: vmovdqa 224(%rdi), %ymm7
; AVX2-NEXT: vmovdqa 256(%rdi), %ymm6
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0],ymm6[1,2],ymm7[3],ymm6[4],ymm7[5],ymm6[6,7],ymm7[8],ymm6[9,10],ymm7[11],ymm6[12],ymm7[13],ymm6[14,15]
; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm0[2,3,0,1]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm8[5],ymm0[6],ymm8[7]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm10 = [0,1,10,11,4,5,14,15,8,9,10,11,4,5,6,7,16,17,26,27,20,21,30,31,24,25,26,27,20,21,22,23]
; AVX2-NEXT: vpshufb %ymm10, %ymm0, %ymm0
; AVX2-NEXT: vpblendw {{.*#+}} ymm8 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13],ymm4[14],ymm5[15]
; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vextracti128 $1, %ymm8, %xmm9
; AVX2-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1,2,3],xmm8[4,5],xmm9[6,7]
; AVX2-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7]
; AVX2-NEXT: vpshufb %xmm11, %xmm8, %xmm8
; AVX2-NEXT: vpmovsxbw {{.*#+}} xmm9 = [65535,65535,65535,65535,65535,65535,65535,0]
; AVX2-NEXT: vpblendvb %ymm9, %ymm8, %ymm0, %ymm8
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm15[1,2],ymm3[3],ymm15[4],ymm3[5],ymm15[6,7],ymm3[8],ymm15[9,10],ymm3[11],ymm15[12],ymm3[13],ymm15[14,15]
; AVX2-NEXT: vpermq {{.*#+}} ymm12 = ymm0[2,3,0,1]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm12[5],ymm0[6],ymm12[7]
; AVX2-NEXT: vpshufb %ymm10, %ymm0, %ymm0
; AVX2-NEXT: vpblendw {{.*#+}} ymm10 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15]
; AVX2-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill
; AVX2-NEXT: vextracti128 $1, %ymm10, %xmm12
; AVX2-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm12[1,2,3],xmm10[4,5],xmm12[6,7]
; AVX2-NEXT: vpshufb %xmm11, %xmm10, %xmm10
; AVX2-NEXT: vpblendvb %ymm9, %ymm10, %ymm0, %ymm12
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm7[1],ymm6[2,3],ymm7[4],ymm6[5],ymm7[6],ymm6[7,8],ymm7[9],ymm6[10,11],ymm7[12],ymm6[13],ymm7[14],ymm6[15]
; AVX2-NEXT: vpermq {{.*#+}} ymm10 = ymm0[2,3,0,1]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm10[5],ymm0[6,7]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm10 = [2,3,12,13,6,7,0,1,10,11,6,7,8,9,8,9,18,19,28,29,22,23,16,17,26,27,22,23,24,25,24,25]
; AVX2-NEXT: vpshufb %ymm10, %ymm0, %ymm0
; AVX2-NEXT: vpblendw {{.*#+}} ymm11 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10],ymm5[11],ymm4[12,13],ymm5[14],ymm4[15]
; AVX2-NEXT: vextracti128 $1, %ymm11, %xmm13
; AVX2-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1],xmm13[2,3],xmm11[4,5,6],xmm13[7]
; AVX2-NEXT: vmovdqa {{.*#+}} xmm14 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11]
; AVX2-NEXT: vpshufb %xmm14, %xmm11, %xmm11
; AVX2-NEXT: vpblendvb %ymm9, %ymm11, %ymm0, %ymm0
; AVX2-NEXT: vpblendw {{.*#+}} ymm11 = ymm15[0],ymm3[1],ymm15[2,3],ymm3[4],ymm15[5],ymm3[6],ymm15[7,8],ymm3[9],ymm15[10,11],ymm3[12],ymm15[13],ymm3[14],ymm15[15]
; AVX2-NEXT: vmovdqa %ymm15, %ymm5
; AVX2-NEXT: vpermq {{.*#+}} ymm13 = ymm11[2,3,0,1]
; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm13[5],ymm11[6,7]
; AVX2-NEXT: vpshufb %ymm10, %ymm11, %ymm10
; AVX2-NEXT: vpblendw {{.*#+}} ymm11 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15]
; AVX2-NEXT: vmovdqa %ymm2, %ymm15
; AVX2-NEXT: vextracti128 $1, %ymm11, %xmm13
; AVX2-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1],xmm13[2,3],xmm11[4,5,6],xmm13[7]
; AVX2-NEXT: vmovdqa 304(%rdi), %xmm13
; AVX2-NEXT: vpshufb %xmm14, %xmm11, %xmm11
; AVX2-NEXT: vmovdqa 288(%rdi), %xmm14
; AVX2-NEXT: vpblendvb %ymm9, %ymm11, %ymm10, %ymm9
; AVX2-NEXT: vpblendd {{.*#+}} xmm10 = xmm14[0],xmm13[1],xmm14[2,3]
; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7]
; AVX2-NEXT: vpshufb %xmm1, %xmm10, %xmm10
; AVX2-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
; AVX2-NEXT: vpblendw {{.*#+}} ymm10 = ymm8[0,1,2,3,4],ymm10[5,6,7],ymm8[8,9,10,11,12],ymm10[13,14,15]
; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm10[4,5,6,7]
; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovdqa 144(%rdi), %xmm11
; AVX2-NEXT: vmovdqa 128(%rdi), %xmm10
; AVX2-NEXT: vpblendd {{.*#+}} xmm8 = xmm10[0],xmm11[1],xmm10[2,3]
; AVX2-NEXT: vpshufb %xmm1, %xmm8, %xmm1
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm12[0,1,2,3,4],ymm1[5,6,7],ymm12[8,9,10,11,12],ymm1[13,14,15]
; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm14[0,1],xmm13[2],xmm14[3]
; AVX2-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9]
; AVX2-NEXT: vpshufb %xmm8, %xmm1, %xmm1
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm11[2],xmm10[3]
; AVX2-NEXT: vpshufb %xmm8, %xmm0, %xmm0
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0,1,2,3,4],ymm0[5,6,7],ymm9[8,9,10,11,12],ymm0[13,14,15]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7]
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5],ymm6[6],ymm7[7,8],ymm6[9],ymm7[10,11],ymm6[12],ymm7[13],ymm6[14],ymm7[15]
; AVX2-NEXT: vmovdqa %ymm6, %ymm9
; AVX2-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7]
; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0,1],ymm2[2],ymm4[3],ymm2[4],ymm4[5,6],ymm2[7],ymm4[8,9],ymm2[10],ymm4[11],ymm2[12],ymm4[13,14],ymm2[15]
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm8
; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm8[3,4],xmm1[5,6,7]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27]
; AVX2-NEXT: vpshufb %ymm8, %ymm0, %ymm0
; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm13[0],xmm14[1],xmm13[2,3]
; AVX2-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11]
; AVX2-NEXT: vpshufb %xmm12, %xmm1, %xmm1
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm5[1],ymm3[2,3],ymm5[4],ymm3[5],ymm5[6],ymm3[7,8],ymm5[9],ymm3[10,11],ymm5[12],ymm3[13],ymm5[14],ymm3[15]
; AVX2-NEXT: vmovdqa %ymm5, %ymm6
; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7]
; AVX2-NEXT: vpshufb %ymm8, %ymm0, %ymm0
; AVX2-NEXT: vmovdqu (%rsp), %ymm5 # 32-byte Reload
; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm15[0,1],ymm5[2],ymm15[3],ymm5[4],ymm15[5,6],ymm5[7],ymm15[8,9],ymm5[10],ymm15[11],ymm5[12],ymm15[13,14],ymm5[15]
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm8
; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm8[3,4],xmm1[5,6,7]
; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm11[0],xmm10[1],xmm11[2,3]
; AVX2-NEXT: vpshufb %xmm12, %xmm1, %xmm1
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm7[1],ymm9[2],ymm7[3],ymm9[4,5],ymm7[6],ymm9[7,8],ymm7[9],ymm9[10],ymm7[11],ymm9[12,13],ymm7[14],ymm9[15]
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm4[1,2],ymm2[3],ymm4[4],ymm2[5],ymm4[6,7],ymm2[8],ymm4[9,10],ymm2[11],ymm4[12],ymm2[13],ymm4[14,15]
; AVX2-NEXT: vmovdqa %ymm4, %ymm7
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm8
; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm8[0],xmm1[1],xmm8[2],xmm1[3]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29]
; AVX2-NEXT: vpshufb %ymm12, %ymm0, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} xmm8 = [6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u]
; AVX2-NEXT: vpshufb %xmm8, %xmm1, %xmm1
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm13[0,1],xmm14[2],xmm13[3]
; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13]
; AVX2-NEXT: vpshufb %xmm4, %xmm1, %xmm1
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm3[1],ymm6[2],ymm3[3],ymm6[4,5],ymm3[6],ymm6[7,8],ymm3[9],ymm6[10],ymm3[11],ymm6[12,13],ymm3[14],ymm6[15]
; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm0[2,3,0,1]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4],ymm0[5],ymm9[6],ymm0[7]
; AVX2-NEXT: vpshufb %ymm12, %ymm0, %ymm0
; AVX2-NEXT: vpblendw {{.*#+}} ymm9 = ymm5[0],ymm15[1,2],ymm5[3],ymm15[4],ymm5[5],ymm15[6,7],ymm5[8],ymm15[9,10],ymm5[11],ymm15[12],ymm5[13],ymm15[14,15]
; AVX2-NEXT: vmovdqa %ymm5, %ymm1
; AVX2-NEXT: vextracti128 $1, %ymm9, %xmm12
; AVX2-NEXT: vpblendd {{.*#+}} xmm9 = xmm12[0],xmm9[1],xmm12[2],xmm9[3]
; AVX2-NEXT: vpshufb %xmm8, %xmm9, %xmm8
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2],ymm0[3,4,5,6,7]
; AVX2-NEXT: vpblendd {{.*#+}} xmm8 = xmm11[0,1],xmm10[2],xmm11[3]
; AVX2-NEXT: vpshufb %xmm4, %xmm8, %xmm2
; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0,1,2,3,4],ymm2[5,6,7],ymm0[8,9,10,11,12],ymm2[13,14,15]
; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
; AVX2-NEXT: # ymm2 = ymm2[0,1],mem[2],ymm2[3],mem[4],ymm2[5,6],mem[7],ymm2[8,9],mem[10],ymm2[11],mem[12],ymm2[13,14],mem[15]
; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm2[2,3,0,1]
; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4],ymm2[5,6],ymm6[7]
; AVX2-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm4 # 32-byte Folded Reload
; AVX2-NEXT: # ymm4 = ymm7[0],mem[1],ymm7[2,3],mem[4],ymm7[5],mem[6],ymm7[7,8],mem[9],ymm7[10,11],mem[12],ymm7[13],mem[14],ymm7[15]
; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm5
; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3,4],xmm5[5,6,7]
; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15,24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15]
; AVX2-NEXT: # ymm5 = mem[0,1,0,1]
; AVX2-NEXT: vpshufb %ymm5, %ymm2, %ymm2
; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7]
; AVX2-NEXT: vpshufb %xmm6, %xmm4, %xmm4
; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3,4,5,6,7]
; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm13[3,1,2,3]
; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,2,1,4,5,6,7]
; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm14[0,2,2,3]
; AVX2-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,0,3,4,5,6,7]
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm7[0],xmm4[0],xmm7[1],xmm4[1]
; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7]
; AVX2-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
; AVX2-NEXT: # ymm3 = mem[0,1],ymm3[2],mem[3],ymm3[4],mem[5,6],ymm3[7],mem[8,9],ymm3[10],mem[11],ymm3[12],mem[13,14],ymm3[15]
; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1]
; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4],ymm3[5,6],ymm4[7]
; AVX2-NEXT: vpshufb %ymm5, %ymm3, %ymm3
; AVX2-NEXT: vpblendw {{.*#+}} ymm4 = ymm15[0],ymm1[1],ymm15[2,3],ymm1[4],ymm15[5],ymm1[6],ymm15[7,8],ymm1[9],ymm15[10,11],ymm1[12],ymm15[13],ymm1[14],ymm15[15]
; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm5
; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3,4],xmm5[5,6,7]
; AVX2-NEXT: vpshufb %xmm6, %xmm4, %xmm4
; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7]
; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm11[3,1,2,3]
; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,2,1,4,5,6,7]
; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm10[0,2,2,3]
; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,0,3,4,5,6,7]
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7]
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm4, 32(%rsi)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm1, (%rsi)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm1, 32(%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm1, (%rdx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm1, 32(%rcx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm1, (%rcx)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm1, 32(%r8)
; AVX2-NEXT: vmovdqa %ymm0, (%r8)
; AVX2-NEXT: vmovdqa %ymm2, 32(%r9)
; AVX2-NEXT: vmovdqa %ymm3, (%r9)
; AVX2-NEXT: addq $264, %rsp # imm = 0x108
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX2-FP-LABEL: load_i16_stride5_vf32:
; AVX2-FP: # %bb.0:
; AVX2-FP-NEXT: subq $264, %rsp # imm = 0x108
; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm13
; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm6
; AVX2-FP-NEXT: vmovdqa 64(%rdi), %ymm7
; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm10
; AVX2-FP-NEXT: vmovdqa 192(%rdi), %ymm14
; AVX2-FP-NEXT: vmovdqa 160(%rdi), %ymm3
; AVX2-FP-NEXT: vmovdqa 224(%rdi), %ymm4
; AVX2-FP-NEXT: vmovdqa 256(%rdi), %ymm5
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm5[1,2],ymm4[3],ymm5[4],ymm4[5],ymm5[6,7],ymm4[8],ymm5[9,10],ymm4[11],ymm5[12],ymm4[13],ymm5[14,15]
; AVX2-FP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6],ymm1[7]
; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,10,11,4,5,14,15,8,9,10,11,4,5,6,7,16,17,26,27,20,21,30,31,24,25,26,27,20,21,22,23]
; AVX2-FP-NEXT: vpshufb %ymm1, %ymm0, %ymm0
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0],ymm14[1],ymm3[2,3],ymm14[4],ymm3[5],ymm14[6],ymm3[7,8],ymm14[9],ymm3[10,11],ymm14[12],ymm3[13],ymm14[14],ymm3[15]
; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vextracti128 $1, %ymm8, %xmm9
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm9[1,2,3],xmm8[4,5],xmm9[6,7]
; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7]
; AVX2-FP-NEXT: vpshufb %xmm9, %xmm8, %xmm8
; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} xmm11 = [65535,65535,65535,65535,65535,65535,65535,0]
; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm8, %ymm0, %ymm0
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm8 = ymm7[0],ymm10[1,2],ymm7[3],ymm10[4],ymm7[5],ymm10[6,7],ymm7[8],ymm10[9,10],ymm7[11],ymm10[12],ymm7[13],ymm10[14,15]
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm12 = ymm8[2,3,0,1]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm12[5],ymm8[6],ymm12[7]
; AVX2-FP-NEXT: vpshufb %ymm1, %ymm8, %ymm1
; AVX2-FP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm8 = ymm13[0],ymm6[1],ymm13[2,3],ymm6[4],ymm13[5],ymm6[6],ymm13[7,8],ymm6[9],ymm13[10,11],ymm6[12],ymm13[13],ymm6[14],ymm13[15]
; AVX2-FP-NEXT: vextracti128 $1, %ymm8, %xmm12
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm12[1,2,3],xmm8[4,5],xmm12[6,7]
; AVX2-FP-NEXT: vpshufb %xmm9, %xmm8, %xmm8
; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm8, %ymm1, %ymm12
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13],ymm4[14],ymm5[15]
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm1[2,3,0,1]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm8[5],ymm1[6,7]
; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm8 = [2,3,12,13,6,7,0,1,10,11,6,7,8,9,8,9,18,19,28,29,22,23,16,17,26,27,22,23,24,25,24,25]
; AVX2-FP-NEXT: vpshufb %ymm8, %ymm1, %ymm1
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm9 = ymm14[0],ymm3[1],ymm14[2],ymm3[3],ymm14[4,5],ymm3[6],ymm14[7,8],ymm3[9],ymm14[10],ymm3[11],ymm14[12,13],ymm3[14],ymm14[15]
; AVX2-FP-NEXT: vmovdqa %ymm14, %ymm5
; AVX2-FP-NEXT: vextracti128 $1, %ymm9, %xmm14
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm14[2,3],xmm9[4,5,6],xmm14[7]
; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm14 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11]
; AVX2-FP-NEXT: vpshufb %xmm14, %xmm9, %xmm9
; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm9, %ymm1, %ymm1
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0],ymm7[1],ymm10[2,3],ymm7[4],ymm10[5],ymm7[6],ymm10[7,8],ymm7[9],ymm10[10,11],ymm7[12],ymm10[13],ymm7[14],ymm10[15]
; AVX2-FP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm15 = ymm9[2,3,0,1]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4],ymm15[5],ymm9[6,7]
; AVX2-FP-NEXT: vpshufb %ymm8, %ymm9, %ymm15
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm8 = ymm6[0],ymm13[1],ymm6[2],ymm13[3],ymm6[4,5],ymm13[6],ymm6[7,8],ymm13[9],ymm6[10],ymm13[11],ymm6[12,13],ymm13[14],ymm6[15]
; AVX2-FP-NEXT: vextracti128 $1, %ymm8, %xmm9
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm9 = xmm8[0,1],xmm9[2,3],xmm8[4,5,6],xmm9[7]
; AVX2-FP-NEXT: vmovdqa 304(%rdi), %xmm8
; AVX2-FP-NEXT: vpshufb %xmm14, %xmm9, %xmm14
; AVX2-FP-NEXT: vmovdqa 288(%rdi), %xmm9
; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm14, %ymm15, %ymm11
; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm14 = xmm9[0],xmm8[1],xmm9[2,3]
; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm2 = [6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7]
; AVX2-FP-NEXT: vpshufb %xmm2, %xmm14, %xmm14
; AVX2-FP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm14 = ymm0[0,1,2,3,4],ymm14[5,6,7],ymm0[8,9,10,11,12],ymm14[13,14,15]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7]
; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovdqa 144(%rdi), %xmm6
; AVX2-FP-NEXT: vmovdqa 128(%rdi), %xmm15
; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0],xmm6[1],xmm15[2,3]
; AVX2-FP-NEXT: vpshufb %xmm2, %xmm14, %xmm2
; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm12[0,1,2,3,4],ymm2[5,6,7],ymm12[8,9,10,11,12],ymm2[13,14,15]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm2[4,5,6,7]
; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm9[0,1],xmm8[2],xmm9[3]
; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9]
; AVX2-FP-NEXT: vpshufb %xmm12, %xmm2, %xmm2
; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2,3,4],ymm2[5,6,7],ymm1[8,9,10,11,12],ymm2[13,14,15]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm15[0,1],xmm6[2],xmm15[3]
; AVX2-FP-NEXT: vpshufb %xmm12, %xmm1, %xmm1
; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm11[0,1,2,3,4],ymm1[5,6,7],ymm11[8,9,10,11,12],ymm1[13,14,15]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5],ymm3[6],ymm4[7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13],ymm3[14],ymm4[15]
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7]
; AVX2-FP-NEXT: vmovdqu %ymm5, (%rsp) # 32-byte Spill
; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1],ymm0[2],ymm5[3],ymm0[4],ymm5[5,6],ymm0[7],ymm5[8,9],ymm0[10],ymm5[11],ymm0[12],ymm5[13,14],ymm0[15]
; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm11
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm11[3,4],xmm2[5,6,7]
; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27]
; AVX2-FP-NEXT: vpshufb %ymm11, %ymm1, %ymm1
; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm12 = [4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u]
; AVX2-FP-NEXT: vpshufb %xmm12, %xmm2, %xmm2
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7]
; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm8[0],xmm9[1],xmm8[2,3]
; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11]
; AVX2-FP-NEXT: vpshufb %xmm14, %xmm2, %xmm2
; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2,3,4],ymm2[5,6,7],ymm1[8,9,10,11,12],ymm2[13,14,15]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0],ymm10[1],ymm7[2,3],ymm10[4],ymm7[5],ymm10[6],ymm7[7,8],ymm10[9],ymm7[10,11],ymm10[12],ymm7[13],ymm10[14],ymm7[15]
; AVX2-FP-NEXT: vmovdqa %ymm7, %ymm10
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6],ymm1[7]
; AVX2-FP-NEXT: vpshufb %ymm11, %ymm1, %ymm1
; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0,1],ymm13[2],ymm7[3],ymm13[4],ymm7[5,6],ymm13[7],ymm7[8,9],ymm13[10],ymm7[11],ymm13[12],ymm7[13,14],ymm13[15]
; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm11
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm11[3,4],xmm2[5,6,7]
; AVX2-FP-NEXT: vpshufb %xmm12, %xmm2, %xmm2
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7]
; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm6[0],xmm15[1],xmm6[2,3]
; AVX2-FP-NEXT: vpshufb %xmm14, %xmm2, %xmm2
; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2,3,4],ymm2[5,6,7],ymm1[8,9,10,11,12],ymm2[13,14,15]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm4[1],ymm3[2],ymm4[3],ymm3[4,5],ymm4[6],ymm3[7,8],ymm4[9],ymm3[10],ymm4[11],ymm3[12,13],ymm4[14],ymm3[15]
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7]
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm5[1,2],ymm0[3],ymm5[4],ymm0[5],ymm5[6,7],ymm0[8],ymm5[9,10],ymm0[11],ymm5[12],ymm0[13],ymm5[14,15]
; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm11
; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm11[0],xmm2[1],xmm11[2],xmm2[3]
; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29]
; AVX2-FP-NEXT: vpshufb %ymm11, %ymm1, %ymm1
; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm12 = [6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u]
; AVX2-FP-NEXT: vpshufb %xmm12, %xmm2, %xmm2
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7]
; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm8[0,1],xmm9[2],xmm8[3]
; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13]
; AVX2-FP-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2,3,4],ymm2[5,6,7],ymm1[8,9,10,11,12],ymm2[13,14,15]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; AVX2-FP-NEXT: vmovdqa %ymm10, %ymm5
; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0],ymm5[1],ymm10[2],ymm5[3],ymm10[4,5],ymm5[6],ymm10[7,8],ymm5[9],ymm10[10],ymm5[11],ymm10[12,13],ymm5[14],ymm10[15]
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm14 = ymm1[2,3,0,1]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4],ymm1[5],ymm14[6],ymm1[7]
; AVX2-FP-NEXT: vpshufb %ymm11, %ymm1, %ymm1
; AVX2-FP-NEXT: vmovdqa %ymm7, %ymm4
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm11 = ymm13[0],ymm7[1,2],ymm13[3],ymm7[4],ymm13[5],ymm7[6,7],ymm13[8],ymm7[9,10],ymm13[11],ymm7[12],ymm13[13],ymm7[14,15]
; AVX2-FP-NEXT: vextracti128 $1, %ymm11, %xmm14
; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm11 = xmm14[0],xmm11[1],xmm14[2],xmm11[3]
; AVX2-FP-NEXT: vpshufb %xmm12, %xmm11, %xmm11
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2],ymm1[3,4,5,6,7]
; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm11 = xmm6[0,1],xmm15[2],xmm6[3]
; AVX2-FP-NEXT: vpshufb %xmm3, %xmm11, %xmm3
; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm1[0,1,2,3,4],ymm3[5,6,7],ymm1[8,9,10,11,12],ymm3[13,14,15]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm11 = [12,13,14,15,4,5,14,15,8,9,10,11,12,13,14,15]
; AVX2-FP-NEXT: vpshufb %xmm11, %xmm6, %xmm3
; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,0,1,10,11,8,9,10,11,12,13,14,15]
; AVX2-FP-NEXT: vpshufb %xmm0, %xmm15, %xmm12
; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm12[0],xmm3[0],xmm12[1],xmm3[1]
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm6 = ymm10[0,1],ymm5[2],ymm10[3],ymm5[4],ymm10[5,6],ymm5[7],ymm10[8,9],ymm5[10],ymm10[11],ymm5[12],ymm10[13,14],ymm5[15]
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm7[4],ymm6[5,6],ymm7[7]
; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm13[1],ymm4[2,3],ymm13[4],ymm4[5],ymm13[6],ymm4[7,8],ymm13[9],ymm4[10,11],ymm13[12],ymm4[13],ymm13[14],ymm4[15]
; AVX2-FP-NEXT: vextracti128 $1, %ymm4, %xmm5
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3,4],xmm5[5,6,7]
; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15,24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15]
; AVX2-FP-NEXT: # ymm5 = mem[0,1,0,1]
; AVX2-FP-NEXT: vpshufb %ymm5, %ymm6, %ymm6
; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7]
; AVX2-FP-NEXT: vpshufb %xmm7, %xmm4, %xmm4
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm6[3,4,5,6,7]
; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-FP-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm4 = mem[0,1],ymm4[2],mem[3],ymm4[4],mem[5,6],ymm4[7],mem[8,9],ymm4[10],mem[11],ymm4[12],mem[13,14],ymm4[15]
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm4[2,3,0,1]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm6[4],ymm4[5,6],ymm6[7]
; AVX2-FP-NEXT: vpshufb %ymm5, %ymm4, %ymm4
; AVX2-FP-NEXT: vmovdqu (%rsp), %ymm5 # 32-byte Reload
; AVX2-FP-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm5 = ymm5[0],mem[1],ymm5[2,3],mem[4],ymm5[5],mem[6],ymm5[7,8],mem[9],ymm5[10,11],mem[12],ymm5[13],mem[14],ymm5[15]
; AVX2-FP-NEXT: vextracti128 $1, %ymm5, %xmm6
; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3,4],xmm6[5,6,7]
; AVX2-FP-NEXT: vpshufb %xmm7, %xmm5, %xmm5
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3,4,5,6,7]
; AVX2-FP-NEXT: vpshufb %xmm11, %xmm8, %xmm5
; AVX2-FP-NEXT: vpshufb %xmm0, %xmm9, %xmm0
; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7]
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm4, 32(%rsi)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm4, (%rsi)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm4, 32(%rdx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm4, (%rdx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm4, 32(%rcx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm4, (%rcx)
; AVX2-FP-NEXT: vmovdqa %ymm2, 32(%r8)
; AVX2-FP-NEXT: vmovdqa %ymm1, (%r8)
; AVX2-FP-NEXT: vmovdqa %ymm0, 32(%r9)
; AVX2-FP-NEXT: vmovdqa %ymm3, (%r9)
; AVX2-FP-NEXT: addq $264, %rsp # imm = 0x108
; AVX2-FP-NEXT: vzeroupper
; AVX2-FP-NEXT: retq
;
; AVX2-FCP-LABEL: load_i16_stride5_vf32:
; AVX2-FCP: # %bb.0:
; AVX2-FCP-NEXT: subq $296, %rsp # imm = 0x128
; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %ymm15
; AVX2-FCP-NEXT: vmovdqa 256(%rdi), %ymm1
; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm3
; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm14
; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm4
; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm5
; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm6
; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm7
; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm6[0],ymm7[1,2],ymm6[3],ymm7[4],ymm6[5],ymm7[6,7],ymm6[8],ymm7[9,10],ymm6[11],ymm7[12],ymm6[13],ymm7[14,15]
; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [1,3,0,2,4,6,1,3]
; AVX2-FCP-NEXT: vpermd %ymm8, %ymm10, %ymm8
; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,6,7,8,9,14,15,4,5,14,15,4,5,2,3,16,17,22,23,24,25,30,31,20,21,30,31,20,21,18,19]
; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm8, %ymm8
; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10,11],ymm5[12],ymm4[13],ymm5[14],ymm4[15]
; AVX2-FCP-NEXT: vextracti128 $1, %ymm9, %xmm12
; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm12[1,2,3],xmm9[4,5],xmm12[6,7]
; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7]
; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm9, %xmm13
; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} xmm9 = [65535,65535,65535,65535,65535,65535,65535,0]
; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm13, %ymm8, %ymm8
; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm14[0],ymm3[1],ymm14[2,3],ymm3[4],ymm14[5],ymm3[6],ymm14[7,8],ymm3[9],ymm14[10,11],ymm3[12],ymm14[13],ymm3[14],ymm14[15]
; AVX2-FCP-NEXT: vmovdqa %ymm14, %ymm0
; AVX2-FCP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vextracti128 $1, %ymm13, %xmm14
; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm14[1,2,3],xmm13[4,5],xmm14[6,7]
; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm13, %xmm12
; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm15[0],ymm1[1,2],ymm15[3],ymm1[4],ymm15[5],ymm1[6,7],ymm15[8],ymm1[9,10],ymm15[11],ymm1[12],ymm15[13],ymm1[14,15]
; AVX2-FCP-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill
; AVX2-FCP-NEXT: vmovdqa %ymm15, %ymm2
; AVX2-FCP-NEXT: vpermd %ymm13, %ymm10, %ymm10
; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm10, %ymm10
; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm12, %ymm10, %ymm11
; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5],ymm6[6],ymm7[7,8],ymm6[9],ymm7[10,11],ymm6[12],ymm7[13],ymm6[14],ymm7[15]
; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [2,0,0,0,4,7,1,6]
; AVX2-FCP-NEXT: vpermd %ymm10, %ymm12, %ymm10
; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [2,3,4,5,10,11,0,1,14,15,2,3,12,13,0,1,18,19,20,21,26,27,16,17,30,31,18,19,28,29,16,17]
; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm10, %ymm10
; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15]
; AVX2-FCP-NEXT: vmovdqa %ymm4, %ymm7
; AVX2-FCP-NEXT: vextracti128 $1, %ymm13, %xmm15
; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1],xmm15[2,3],xmm13[4,5,6],xmm15[7]
; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm15 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11]
; AVX2-FCP-NEXT: vpshufb %xmm15, %xmm13, %xmm13
; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm13, %ymm10, %ymm13
; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm3[0],ymm0[1],ymm3[2],ymm0[3],ymm3[4,5],ymm0[6],ymm3[7,8],ymm0[9],ymm3[10],ymm0[11],ymm3[12,13],ymm0[14],ymm3[15]
; AVX2-FCP-NEXT: vextracti128 $1, %ymm10, %xmm0
; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3],xmm10[4,5,6],xmm0[7]
; AVX2-FCP-NEXT: vpshufb %xmm15, %xmm0, %xmm0
; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15]
; AVX2-FCP-NEXT: vmovdqa %ymm2, %ymm15
; AVX2-FCP-NEXT: vpermd %ymm10, %ymm12, %ymm10
; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm12
; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm10, %ymm10
; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [0,3,1,3,0,3,5,7]
; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm0, %ymm10, %ymm0
; AVX2-FCP-NEXT: vpermd %ymm12, %ymm14, %ymm9
; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm6 = [0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27]
; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm9, %ymm9
; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm9 = ymm8[0,1,2,3,4],ymm9[5,6,7],ymm8[8,9,10,11,12],ymm9[13,14,15]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm9[4,5,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovdqa 288(%rdi), %ymm10
; AVX2-FCP-NEXT: vpermd %ymm10, %ymm14, %ymm8
; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm8, %ymm8
; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm11[0,1,2,3,4],ymm8[5,6,7],ymm11[8,9,10,11,12],ymm8[13,14,15]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm8[4,5,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [1,3,2,3,1,3,6,7]
; AVX2-FCP-NEXT: vpermd %ymm12, %ymm8, %ymm11
; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm2 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25]
; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm11, %ymm11
; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm13[0,1,2,3,4],ymm11[5,6,7],ymm13[8,9,10,11,12],ymm11[13,14,15]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3],ymm11[4,5,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpermd %ymm10, %ymm8, %ymm8
; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm8, %ymm8
; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm0[0,1,2,3,4],ymm8[5,6,7],ymm0[8,9,10,11,12],ymm8[13,14,15]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0,1],ymm4[2],ymm5[3],ymm4[4],ymm5[5,6],ymm4[7],ymm5[8,9],ymm4[10],ymm5[11],ymm4[12],ymm5[13,14],ymm4[15]
; AVX2-FCP-NEXT: vmovdqa %ymm5, %ymm9
; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm8
; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm8[3,4],xmm0[5,6,7]
; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13],ymm4[14],ymm5[15]
; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [0,2,0,0,5,7,2,4]
; AVX2-FCP-NEXT: vpermd %ymm8, %ymm14, %ymm8
; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23]
; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm8, %ymm8
; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u]
; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm0, %xmm0
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm8[3,4,5,6,7]
; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [1,4,6,0,1,4,6,0]
; AVX2-FCP-NEXT: # ymm8 = mem[0,1,0,1]
; AVX2-FCP-NEXT: vpermd %ymm12, %ymm8, %ymm11
; AVX2-FCP-NEXT: vpshufb %ymm6, %ymm11, %ymm11
; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm0[0,1,2,3,4],ymm11[5,6,7],ymm0[8,9,10,11,12],ymm11[13,14,15]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovdqa %ymm3, %ymm6
; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm3[0,1],ymm0[2],ymm3[3],ymm0[4],ymm3[5,6],ymm0[7],ymm3[8,9],ymm0[10],ymm3[11],ymm0[12],ymm3[13,14],ymm0[15]
; AVX2-FCP-NEXT: vextracti128 $1, %ymm11, %xmm13
; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm13[3,4],xmm11[5,6,7]
; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm11, %xmm1
; AVX2-FCP-NEXT: vmovdqu (%rsp), %ymm3 # 32-byte Reload
; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm15[0],ymm3[1],ymm15[2,3],ymm3[4],ymm15[5],ymm3[6],ymm15[7,8],ymm3[9],ymm15[10,11],ymm3[12],ymm15[13],ymm3[14],ymm15[15]
; AVX2-FCP-NEXT: vpermd %ymm11, %ymm14, %ymm11
; AVX2-FCP-NEXT: vpshufb %ymm2, %ymm11, %ymm2
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
; AVX2-FCP-NEXT: vpermd %ymm10, %ymm8, %ymm2
; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27]
ymm1[0,1,2,3,4],ymm2[5,6,7],ymm1[8,9,10,11,12],ymm2[13,14,15] 3767; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7] 3768; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3769; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0],ymm9[1,2],ymm7[3],ymm9[4],ymm7[5],ymm9[6,7],ymm7[8],ymm9[9,10],ymm7[11],ymm9[12],ymm7[13],ymm9[14,15] 3770; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2 3771; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] 3772; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10],ymm5[11],ymm4[12,13],ymm5[14],ymm4[15] 3773; AVX2-FCP-NEXT: vmovdqa %ymm4, %ymm7 3774; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,3,0,0,5,0,2,7] 3775; AVX2-FCP-NEXT: vpermd %ymm2, %ymm11, %ymm2 3776; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,u,u,u,u,u,u,u,u,u,u,2,3,4,5,18,19,20,21,26,27,16,17,30,31,30,31,18,19,20,21] 3777; AVX2-FCP-NEXT: vpshufb %ymm13, %ymm2, %ymm2 3778; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm14 = [6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u] 3779; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm1, %xmm1 3780; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7] 3781; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [2,4,7,0,2,4,7,0] 3782; AVX2-FCP-NEXT: # ymm2 = mem[0,1,0,1] 3783; AVX2-FCP-NEXT: vpermd %ymm12, %ymm2, %ymm8 3784; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm4 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25] 3785; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm8, %ymm8 3786; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm1[0,1,2,3,4],ymm8[5,6,7],ymm1[8,9,10,11,12],ymm8[13,14,15] 3787; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7] 3788; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3789; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm6[1,2],ymm0[3],ymm6[4],ymm0[5],ymm6[6,7],ymm0[8],ymm6[9,10],ymm0[11],ymm6[12],ymm0[13],ymm6[14,15] 3790; AVX2-FCP-NEXT: vmovdqa %ymm6, %ymm8 3791; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm0 3792; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3] 3793; AVX2-FCP-NEXT: vpshufb %xmm14, %xmm0, %xmm0 3794; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm15[1],ymm3[2],ymm15[3],ymm3[4,5],ymm15[6],ymm3[7,8],ymm15[9],ymm3[10],ymm15[11],ymm3[12,13],ymm15[14],ymm3[15] 3795; AVX2-FCP-NEXT: vmovdqa %ymm3, %ymm9 3796; AVX2-FCP-NEXT: vmovdqa %ymm15, %ymm14 3797; AVX2-FCP-NEXT: vpermd %ymm1, %ymm11, %ymm1 3798; AVX2-FCP-NEXT: vpshufb %ymm13, %ymm1, %ymm1 3799; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] 3800; AVX2-FCP-NEXT: vpermd %ymm10, %ymm2, %ymm1 3801; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm1, %ymm1 3802; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15] 3803; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm0[0,1,2,3],ymm1[4,5,6,7] 3804; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0,1],ymm5[2],ymm7[3],ymm5[4],ymm7[5,6],ymm5[7],ymm7[8,9],ymm5[10],ymm7[11],ymm5[12],ymm7[13,14],ymm5[15] 3805; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 3806; AVX2-FCP-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload 3807; AVX2-FCP-NEXT: # ymm1 = mem[0],ymm1[1],mem[2,3],ymm1[4],mem[5],ymm1[6],mem[7,8],ymm1[9],mem[10,11],ymm1[12],mem[13],ymm1[14],mem[15] 3808; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2 3809; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4],xmm2[5,6,7] 3810; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = 
[1,3,0,0,6,0,3,5] 3811; AVX2-FCP-NEXT: vpermd %ymm0, %ymm2, %ymm0 3812; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [16,17,22,23,24,25,30,31,0,0,0,0,0,1,6,7,16,17,22,23,24,25,30,31,0,0,0,0,0,1,6,7] 3813; AVX2-FCP-NEXT: # ymm4 = mem[0,1,0,1] 3814; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm0, %ymm0 3815; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7] 3816; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm1, %xmm1 3817; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7] 3818; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,1,3,0,2,5,7] 3819; AVX2-FCP-NEXT: vpermd %ymm12, %ymm1, %ymm6 3820; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm7 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31] 3821; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm6, %ymm6 3822; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm6[6,7] 3823; AVX2-FCP-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm3 # 32-byte Folded Reload 3824; AVX2-FCP-NEXT: # ymm3 = ymm8[0],mem[1],ymm8[2,3],mem[4],ymm8[5],mem[6],ymm8[7,8],mem[9],ymm8[10,11],mem[12],ymm8[13],mem[14],ymm8[15] 3825; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm6 3826; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1,2],xmm3[3,4],xmm6[5,6,7] 3827; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm3, %xmm3 3828; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm9[0,1],ymm14[2],ymm9[3],ymm14[4],ymm9[5,6],ymm14[7],ymm9[8,9],ymm14[10],ymm9[11],ymm14[12],ymm9[13,14],ymm14[15] 3829; AVX2-FCP-NEXT: vpermd %ymm5, %ymm2, %ymm2 3830; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm2 3831; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7] 3832; AVX2-FCP-NEXT: vpermd %ymm10, %ymm1, %ymm1 3833; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm1, %ymm1 3834; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] 3835; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 3836; AVX2-FCP-NEXT: vmovaps %ymm2, 32(%rsi) 3837; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 3838; AVX2-FCP-NEXT: vmovaps %ymm2, (%rsi) 3839; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 3840; AVX2-FCP-NEXT: vmovaps %ymm2, 32(%rdx) 3841; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 3842; AVX2-FCP-NEXT: vmovaps %ymm2, (%rdx) 3843; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 3844; AVX2-FCP-NEXT: vmovaps %ymm2, 32(%rcx) 3845; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 3846; AVX2-FCP-NEXT: vmovaps %ymm2, (%rcx) 3847; AVX2-FCP-NEXT: vmovdqa %ymm15, 32(%r8) 3848; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 3849; AVX2-FCP-NEXT: vmovaps %ymm2, (%r8) 3850; AVX2-FCP-NEXT: vmovdqa %ymm1, 32(%r9) 3851; AVX2-FCP-NEXT: vmovdqa %ymm0, (%r9) 3852; AVX2-FCP-NEXT: addq $296, %rsp # imm = 0x128 3853; AVX2-FCP-NEXT: vzeroupper 3854; AVX2-FCP-NEXT: retq 3855; 3856; AVX512-LABEL: load_i16_stride5_vf32: 3857; AVX512: # %bb.0: 3858; AVX512-NEXT: vmovdqa 256(%rdi), %ymm0 3859; AVX512-NEXT: vmovdqa 288(%rdi), %ymm1 3860; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15] 3861; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3 3862; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4],xmm2[5,6,7] 3863; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,4,5,14,15,8,9,2,3,12,13,6,7] 3864; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm4 3865; AVX512-NEXT: vmovdqa 192(%rdi), %ymm3 3866; AVX512-NEXT: vmovdqa 
224(%rdi), %ymm9 3867; AVX512-NEXT: vpblendw {{.*#+}} ymm5 = ymm9[0],ymm3[1],ymm9[2,3],ymm3[4],ymm9[5],ymm3[6],ymm9[7,8],ymm3[9],ymm9[10,11],ymm3[12],ymm9[13],ymm3[14],ymm9[15] 3868; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm6 3869; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1,2],ymm6[3],ymm5[4,5,6,7] 3870; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,8,9,2,3,12,13,6,7,16,17,26,27,26,27,30,31,24,25,18,19,28,29,22,23] 3871; AVX512-NEXT: vpshufb %ymm7, %ymm5, %ymm5 3872; AVX512-NEXT: vmovdqa64 176(%rdi), %xmm20 3873; AVX512-NEXT: vpshufd {{.*#+}} xmm8 = xmm20[3,1,2,3] 3874; AVX512-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[2,1,2,3,4,5,6,7] 3875; AVX512-NEXT: vmovdqa 160(%rdi), %xmm6 3876; AVX512-NEXT: vpshufd {{.*#+}} xmm10 = xmm6[0,2,2,3] 3877; AVX512-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,3,2,3,4,5,6,7] 3878; AVX512-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm10[0],xmm8[0],xmm10[1],xmm8[1] 3879; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1],ymm5[2,3,4,5,6,7] 3880; AVX512-NEXT: vpblendd {{.*#+}} ymm14 = ymm5[0,1,2,3,4],ymm4[5,6,7] 3881; AVX512-NEXT: vmovdqa (%rdi), %ymm8 3882; AVX512-NEXT: vmovdqa 32(%rdi), %ymm10 3883; AVX512-NEXT: vmovdqa 64(%rdi), %ymm4 3884; AVX512-NEXT: vmovdqa 96(%rdi), %ymm5 3885; AVX512-NEXT: vpblendw {{.*#+}} ymm11 = ymm4[0],ymm5[1,2],ymm4[3],ymm5[4],ymm4[5],ymm5[6,7],ymm4[8],ymm5[9,10],ymm4[11],ymm5[12],ymm4[13],ymm5[14,15] 3886; AVX512-NEXT: vpermq {{.*#+}} ymm12 = ymm11[2,3,0,1] 3887; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm12[5],ymm11[6],ymm12[7] 3888; AVX512-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[6,7,16,17,26,27,20,21,30,31,24,25],zero,zero,zero,zero,zero,zero 3889; AVX512-NEXT: vpblendw {{.*#+}} ymm12 = ymm8[0],ymm10[1],ymm8[2,3],ymm10[4],ymm8[5],ymm10[6],ymm8[7,8],ymm10[9],ymm8[10,11],ymm10[12],ymm8[13],ymm10[14],ymm8[15] 3890; AVX512-NEXT: vextracti128 $1, %ymm12, %xmm13 3891; AVX512-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm13[1,2,3],xmm12[4,5],xmm13[6,7] 3892; AVX512-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,1,10,11,4,5,14,15,8,9,2,3,12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[u,u,u,u,u,u] 3893; AVX512-NEXT: vpor %ymm11, %ymm12, %ymm15 3894; AVX512-NEXT: vmovdqa 144(%rdi), %xmm11 3895; AVX512-NEXT: vmovdqa 128(%rdi), %xmm12 3896; AVX512-NEXT: vpblendd {{.*#+}} xmm13 = xmm12[0],xmm11[1],xmm12[2,3] 3897; AVX512-NEXT: vpshufb %xmm7, %xmm13, %xmm7 3898; AVX512-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 3899; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm18 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] 3900; AVX512-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm18 & (zmm7 ^ zmm15)) 3901; AVX512-NEXT: vinserti64x4 $1, %ymm14, %zmm7, %zmm16 3902; AVX512-NEXT: vpblendw {{.*#+}} ymm7 = ymm3[0],ymm9[1],ymm3[2,3],ymm9[4],ymm3[5],ymm9[6],ymm3[7,8],ymm9[9],ymm3[10,11],ymm9[12],ymm3[13],ymm9[14],ymm3[15] 3903; AVX512-NEXT: vextracti128 $1, %ymm7, %xmm14 3904; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0],ymm14[1],ymm7[2],ymm14[3],ymm7[4,5,6,7] 3905; AVX512-NEXT: vmovdqa {{.*#+}} ymm14 = [2,3,12,13,0,1,0,1,10,11,4,5,14,15,8,9,18,19,28,29,16,17,16,17,26,27,20,21,30,31,24,25] 3906; AVX512-NEXT: vpshufb %ymm14, %ymm7, %ymm7 3907; AVX512-NEXT: vpsrlq $48, %xmm20, %xmm15 3908; AVX512-NEXT: vpshufd {{.*#+}} xmm13 = xmm6[0,3,2,3] 3909; AVX512-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[1,2,2,3,4,5,6,7] 3910; AVX512-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1] 
3911; AVX512-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2],xmm7[3,4,5,6,7] 3912; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm13[0,1,2,3],ymm7[4,5,6,7] 3913; AVX512-NEXT: vmovdqa %ymm0, %ymm2 3914; AVX512-NEXT: vpblendw {{.*#+}} ymm13 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15] 3915; AVX512-NEXT: vextracti128 $1, %ymm13, %xmm15 3916; AVX512-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0],xmm13[1],xmm15[2],xmm13[3] 3917; AVX512-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,6,7,0,1,10,11,4,5,14,15,8,9] 3918; AVX512-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 3919; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm13[5,6,7] 3920; AVX512-NEXT: vpblendw {{.*#+}} ymm13 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13],ymm4[14],ymm5[15] 3921; AVX512-NEXT: vpermq {{.*#+}} ymm15 = ymm13[2,3,0,1] 3922; AVX512-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm15[5],ymm13[6,7] 3923; AVX512-NEXT: vpblendw {{.*#+}} ymm15 = ymm10[0],ymm8[1],ymm10[2],ymm8[3],ymm10[4,5],ymm8[6],ymm10[7,8],ymm8[9],ymm10[10],ymm8[11],ymm10[12,13],ymm8[14],ymm10[15] 3924; AVX512-NEXT: vextracti128 $1, %ymm15, %xmm0 3925; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm15[0,1],xmm0[2,3],xmm15[4,5,6],xmm0[7] 3926; AVX512-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[8,9,18,19,28,29,22,23,16,17,26,27],zero,zero,zero,zero,zero,zero 3927; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,12,13,6,7,0,1,10,11,4,5,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[u,u,u,u,u,u] 3928; AVX512-NEXT: vpor %ymm0, %ymm13, %ymm0 3929; AVX512-NEXT: vpblendd {{.*#+}} xmm13 = xmm12[0,1],xmm11[2],xmm12[3] 3930; AVX512-NEXT: vpshufb %xmm14, %xmm13, %xmm13 3931; AVX512-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 3932; AVX512-NEXT: vpternlogq {{.*#+}} zmm13 = zmm13 ^ (zmm18 & (zmm13 ^ zmm0)) 3933; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm13, %zmm19 3934; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm3[1],ymm9[2],ymm3[3],ymm9[4,5],ymm3[6],ymm9[7,8],ymm3[9],ymm9[10],ymm3[11],ymm9[12,13],ymm3[14],ymm9[15] 3935; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm7 3936; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2,3,4,5,6,7] 3937; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = [4,5,14,15,12,13,2,3,12,13,6,7,0,1,10,11,20,21,30,31,28,29,18,19,28,29,22,23,16,17,26,27] 3938; AVX512-NEXT: vpshufb %ymm7, %ymm0, %ymm0 3939; AVX512-NEXT: vmovdqa64 %ymm7, %ymm21 3940; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[0,1,1,3] 3941; AVX512-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,7,6,7] 3942; AVX512-NEXT: vmovdqa64 %xmm20, %xmm15 3943; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm7[2],xmm20[2],xmm7[3],xmm20[3] 3944; AVX512-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm0[3,4,5,6,7] 3945; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] 3946; AVX512-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] 3947; AVX512-NEXT: vextracti128 $1, %ymm7, %xmm13 3948; AVX512-NEXT: vpblendw {{.*#+}} xmm7 = xmm13[0,1,2],xmm7[3,4],xmm13[5,6,7] 3949; AVX512-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,8,9,2,3,12,13,6,7,0,1,10,11] 3950; AVX512-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 3951; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm0[0,1,2,3,4],ymm7[5,6,7] 3952; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = 
ymm9[0,1],ymm3[2],ymm9[3],ymm3[4],ymm9[5,6],ymm3[7],ymm9[8,9],ymm3[10],ymm9[11],ymm3[12],ymm9[13,14],ymm3[15] 3953; AVX512-NEXT: vmovdqa64 %ymm9, %ymm20 3954; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm13 3955; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm13[2],ymm0[3,4,5,6,7] 3956; AVX512-NEXT: vmovdqa {{.*#+}} ymm13 = [6,7,6,7,8,9,4,5,14,15,8,9,2,3,12,13,22,23,22,23,24,25,20,21,30,31,24,25,18,19,28,29] 3957; AVX512-NEXT: vpshufb %ymm13, %ymm0, %ymm0 3958; AVX512-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0],xmm6[1],xmm15[2,3] 3959; AVX512-NEXT: vmovdqa64 %xmm15, %xmm22 3960; AVX512-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[6,7,0,1,10,11,u,u,u,u,u,u,u,u,u,u] 3961; AVX512-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2],xmm0[3,4,5,6,7] 3962; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] 3963; AVX512-NEXT: vpblendd {{.*#+}} xmm14 = xmm11[0,1],xmm12[2],xmm11[3] 3964; AVX512-NEXT: vpshufb %xmm13, %xmm14, %xmm13 3965; AVX512-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 3966; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm13, %zmm0 3967; AVX512-NEXT: vpblendw {{.*#+}} ymm13 = ymm8[0],ymm10[1,2],ymm8[3],ymm10[4],ymm8[5],ymm10[6,7],ymm8[8],ymm10[9,10],ymm8[11],ymm10[12],ymm8[13],ymm10[14,15] 3968; AVX512-NEXT: vextracti128 $1, %ymm13, %xmm14 3969; AVX512-NEXT: vpblendd {{.*#+}} xmm13 = xmm14[0],xmm13[1],xmm14[2],xmm13[3] 3970; AVX512-NEXT: vpblendw {{.*#+}} ymm14 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15] 3971; AVX512-NEXT: vpermq {{.*#+}} ymm15 = ymm14[2,3,0,1] 3972; AVX512-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4],ymm14[5],ymm15[6],ymm14[7] 3973; AVX512-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u] 3974; AVX512-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29] 3975; AVX512-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2],ymm14[3,4,5,6,7] 3976; AVX512-NEXT: vpblendw {{.*#+}} ymm14 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13],ymm1[14],ymm2[15] 3977; AVX512-NEXT: vmovdqa %ymm2, %ymm9 3978; AVX512-NEXT: vextracti128 $1, %ymm14, %xmm15 3979; AVX512-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1,2,3],xmm14[4,5],xmm15[6,7] 3980; AVX512-NEXT: vpternlogq {{.*#+}} zmm13 = zmm0 ^ (mem & (zmm13 ^ zmm0)) 3981; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm14[u,u,0,1,10,11,4,5,14,15,8,9,2,3,12,13] 3982; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 3983; AVX512-NEXT: vextracti64x4 $1, %zmm13, %ymm14 3984; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm14[0],ymm0[1,2,3,4,5,6,7],ymm14[8],ymm0[9,10,11,12,13,14,15] 3985; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] 3986; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm13, %zmm17 3987; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm11[3,1,2,3] 3988; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7] 3989; AVX512-NEXT: vpshufd {{.*#+}} xmm13 = xmm12[0,2,2,3] 3990; AVX512-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,1,0,3,4,5,6,7] 3991; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm13[0],xmm0[0],xmm13[1],xmm0[1] 3992; AVX512-NEXT: vpblendw {{.*#+}} ymm13 = ymm10[0,1],ymm8[2],ymm10[3],ymm8[4],ymm10[5,6],ymm8[7],ymm10[8,9],ymm8[10],ymm10[11],ymm8[12],ymm10[13,14],ymm8[15] 3993; AVX512-NEXT: vextracti128 $1, %ymm13, %xmm14 3994; AVX512-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2],xmm14[3,4],xmm13[5,6,7] 3995; AVX512-NEXT: vpblendw {{.*#+}} ymm14 = 
ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10,11],ymm5[12],ymm4[13],ymm5[14],ymm4[15] 3996; AVX512-NEXT: vpermq {{.*#+}} ymm15 = ymm14[2,3,0,1] 3997; AVX512-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6],ymm14[7] 3998; AVX512-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u] 3999; AVX512-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27] 4000; AVX512-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2],ymm14[3,4,5,6,7] 4001; AVX512-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0],xmm12[1],xmm11[2,3] 4002; AVX512-NEXT: vmovdqa64 %ymm21, %ymm2 4003; AVX512-NEXT: vpshufb %xmm2, %xmm11, %xmm11 4004; AVX512-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 4005; AVX512-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 ^ (zmm18 & (zmm11 ^ zmm13)) 4006; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm11, %zmm7 4007; AVX512-NEXT: vmovdqa64 %ymm20, %ymm2 4008; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1,2],ymm3[3],ymm2[4],ymm3[5],ymm2[6,7],ymm3[8],ymm2[9,10],ymm3[11],ymm2[12],ymm3[13],ymm2[14,15] 4009; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3 4010; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3,4,5,6,7] 4011; AVX512-NEXT: vmovdqa64 %xmm22, %xmm3 4012; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm6[2],xmm3[3] 4013; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[8,9,14,15,4,5,6,7,0,1,10,11,4,5,14,15,24,25,30,31,20,21,22,23,16,17,26,27,20,21,30,31] 4014; AVX512-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u] 4015; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm2[3,4,5,6,7] 4016; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] 4017; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 4018; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 4019; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm10[0],ymm8[1],ymm10[2,3],ymm8[4],ymm10[5],ymm8[6],ymm10[7,8],ymm8[9],ymm10[10,11],ymm8[12],ymm10[13],ymm8[14],ymm10[15] 4020; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3 4021; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7] 4022; AVX512-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0,1],ymm4[2],ymm5[3],ymm4[4],ymm5[5,6],ymm4[7],ymm5[8,9],ymm4[10],ymm5[11],ymm4[12],ymm5[13,14],ymm4[15] 4023; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] 4024; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4],ymm3[5,6],ymm4[7] 4025; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[8,9,2,3,12,13,6,7,0,1,10,11,u,u,u,u] 4026; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,4,5,14,15,24,25,18,19,28,29,22,23,20,21,22,23,20,21,30,31] 4027; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] 4028; AVX512-NEXT: movb $7, %al 4029; AVX512-NEXT: kmovw %eax, %k1 4030; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 {%k1} 4031; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm9[1],ymm1[2],ymm9[3],ymm1[4,5],ymm9[6],ymm1[7,8],ymm9[9],ymm1[10],ymm9[11],ymm1[12,13],ymm9[14],ymm1[15] 4032; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2 4033; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6],xmm2[7] 4034; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm2 4035; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,2,3,12,13,6,7,0,1,10,11,4,5,14,15] 4036; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 4037; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15] 4038; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] 4039; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 
4040; AVX512-NEXT: vmovdqa64 %zmm16, (%rsi) 4041; AVX512-NEXT: vmovdqa64 %zmm19, (%rdx) 4042; AVX512-NEXT: vmovdqa64 %zmm7, (%rcx) 4043; AVX512-NEXT: vmovdqa64 %zmm17, (%r8) 4044; AVX512-NEXT: vmovdqa64 %zmm0, (%r9) 4045; AVX512-NEXT: vzeroupper 4046; AVX512-NEXT: retq 4047; 4048; AVX512-FCP-LABEL: load_i16_stride5_vf32: 4049; AVX512-FCP: # %bb.0: 4050; AVX512-FCP-NEXT: vmovdqa 176(%rdi), %xmm2 4051; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm2[4,5,14,15,4,5,6,7,u,u,u,u,u,u,u,u] 4052; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %xmm3 4053; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm3[0,1,10,11,8,9,10,11,u,u,u,u,u,u,u,u] 4054; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 4055; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm4 4056; AVX512-FCP-NEXT: vmovdqa 224(%rdi), %ymm5 4057; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13],ymm4[14],ymm5[15] 4058; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [2,4,7,1,4,6,0,0] 4059; AVX512-FCP-NEXT: vpermd %ymm1, %ymm6, %ymm1 4060; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,8,9,14,15,0,1,6,7,16,17,22,23,u,u,u,u,u,u,u,u,u,u,u,u] 4061; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [8,9,3,2,4,0,0,0] 4062; AVX512-FCP-NEXT: vpermi2d %ymm0, %ymm1, %ymm6 4063; AVX512-FCP-NEXT: vmovdqa 256(%rdi), %ymm0 4064; AVX512-FCP-NEXT: vmovdqa 288(%rdi), %ymm1 4065; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15] 4066; AVX512-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8 4067; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm8[3,4],xmm7[5,6,7] 4068; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,4,5,14,15,8,9,2,3,12,13,6,7] 4069; AVX512-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 4070; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6,7] 4071; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm10 4072; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm11 4073; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm8 4074; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %ymm9 4075; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm9[1,2],ymm8[3],ymm9[4],ymm8[5],ymm9[6,7],ymm8[8],ymm9[9,10],ymm8[11],ymm9[12],ymm8[13],ymm9[14,15] 4076; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [1,0,0,0,4,6,1,3] 4077; AVX512-FCP-NEXT: vpermd %ymm7, %ymm12, %ymm7 4078; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[2,3,16,17,22,23,24,25,30,31,20,21],zero,zero,zero,zero,zero,zero 4079; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm10[0],ymm11[1],ymm10[2,3],ymm11[4],ymm10[5],ymm11[6],ymm10[7,8],ymm11[9],ymm10[10,11],ymm11[12],ymm10[13],ymm11[14],ymm10[15] 4080; AVX512-FCP-NEXT: vextracti128 $1, %ymm12, %xmm13 4081; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm13[1,2,3],xmm12[4,5],xmm13[6,7] 4082; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,1,10,11,4,5,14,15,8,9,2,3,12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[u,u,u,u,u,u] 4083; AVX512-FCP-NEXT: vpor %ymm7, %ymm12, %ymm12 4084; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [0,3,1,0,0,3,5,0] 4085; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm7 4086; AVX512-FCP-NEXT: vpermd %ymm7, %ymm13, %ymm13 4087; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] 4088; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm13, %ymm13 4089; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = 
[65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] 4090; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm13 = zmm13 ^ (zmm18 & (zmm13 ^ zmm12)) 4091; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm13, %zmm16 4092; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm11[0],ymm10[1],ymm11[2],ymm10[3],ymm11[4,5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10],ymm10[11],ymm11[12,13],ymm10[14],ymm11[15] 4093; AVX512-FCP-NEXT: vextracti128 $1, %ymm12, %xmm13 4094; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1],xmm13[2,3],xmm12[4,5,6],xmm13[7] 4095; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u] 4096; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm12, %ymm12 4097; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm9[0],ymm8[1],ymm9[2,3],ymm8[4],ymm9[5],ymm8[6],ymm9[7,8],ymm8[9],ymm9[10,11],ymm8[12],ymm9[13],ymm8[14],ymm9[15] 4098; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm17 = [2,0,0,0,4,7,1,6] 4099; AVX512-FCP-NEXT: vpermd %ymm13, %ymm17, %ymm13 4100; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[0,1,18,19,20,21,26,27,16,17,30,31],zero,zero,zero,zero,zero,zero 4101; AVX512-FCP-NEXT: vpor %ymm13, %ymm12, %ymm12 4102; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [1,3,2,0,1,3,6,0] 4103; AVX512-FCP-NEXT: vpermd %ymm7, %ymm13, %ymm15 4104; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25] 4105; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm15, %ymm15 4106; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm15 = zmm15 ^ (zmm18 & (zmm15 ^ zmm12)) 4107; AVX512-FCP-NEXT: vpshufb %xmm6, %xmm3, %xmm6 4108; AVX512-FCP-NEXT: vpsrlq $48, %xmm2, %xmm12 4109; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1] 4110; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10,11],ymm5[12],ymm4[13],ymm5[14],ymm4[15] 4111; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm17 = [0,2,5,7,4,7,0,0] 4112; AVX512-FCP-NEXT: vpermd %ymm12, %ymm17, %ymm12 4113; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[2,3,4,5,4,5,0,1,6,7,8,9,14,15,4,5,18,19,20,21,20,21,16,17,22,23,24,25,30,31,20,21] 4114; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm12[3,4,5,6,7] 4115; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm12[4,5,6,7] 4116; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15] 4117; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} ymm17 = [1,4,6,3,1,4,6,3] 4118; AVX512-FCP-NEXT: # ymm17 = mem[0,1,2,3,0,1,2,3] 4119; AVX512-FCP-NEXT: vpermd %ymm12, %ymm17, %ymm12 4120; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27,16,17,30,31,24,25] 4121; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm12[5,6,7] 4122; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm15, %zmm17 4123; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] 4124; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm15 4125; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm15[0,1,2],xmm6[3,4],xmm15[5,6,7] 4126; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,8,9,2,3,12,13,6,7,0,1,10,11] 4127; AVX512-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 4128; 
AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm3[u,u,u,u,u,u,u,u,4,5,14,15,u,u,u,u] 4129; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm15 = xmm15[2],xmm2[2],xmm15[3],xmm2[3] 4130; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15] 4131; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm19 = [0,3,5,2,5,7,0,0] 4132; AVX512-FCP-NEXT: vpermd %ymm12, %ymm19, %ymm12 4133; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,1,6,7,2,3,2,3,4,5,10,11,0,1,14,15,16,17,22,23,18,19,18,19,20,21,26,27,16,17,30,31] 4134; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm15 = xmm15[0,1,2],xmm12[3,4,5,6,7] 4135; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3],ymm12[4,5,6,7] 4136; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1,2,3,4],ymm6[5,6,7] 4137; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm11[0,1],ymm10[2],ymm11[3],ymm10[4],ymm11[5,6],ymm10[7],ymm11[8,9],ymm10[10],ymm11[11],ymm10[12],ymm11[13,14],ymm10[15] 4138; AVX512-FCP-NEXT: vextracti128 $1, %ymm12, %xmm15 4139; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2],xmm15[3,4],xmm12[5,6,7] 4140; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u] 4141; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm15 = ymm8[0],ymm9[1],ymm8[2,3],ymm9[4],ymm8[5],ymm9[6],ymm8[7,8],ymm9[9],ymm8[10,11],ymm9[12],ymm8[13],ymm9[14],ymm8[15] 4142; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm19 = [0,2,0,0,5,7,2,4] 4143; AVX512-FCP-NEXT: vpermd %ymm15, %ymm19, %ymm15 4144; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23] 4145; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2],ymm15[3,4,5,6,7] 4146; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [1,4,6,0,1,4,6,0] 4147; AVX512-FCP-NEXT: # ymm15 = mem[0,1,0,1] 4148; AVX512-FCP-NEXT: vpermd %ymm7, %ymm15, %ymm15 4149; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm15, %ymm14 4150; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 ^ (zmm18 & (zmm14 ^ zmm12)) 4151; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm14, %zmm14 4152; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm10[0],ymm11[1,2],ymm10[3],ymm11[4],ymm10[5],ymm11[6,7],ymm10[8],ymm11[9,10],ymm10[11],ymm11[12],ymm10[13],ymm11[14,15] 4153; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm12 4154; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm12[0],xmm6[1],xmm12[2],xmm6[3] 4155; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u] 4156; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm9[0],ymm8[1],ymm9[2],ymm8[3],ymm9[4,5],ymm8[6],ymm9[7,8],ymm8[9],ymm9[10],ymm8[11],ymm9[12,13],ymm8[14],ymm9[15] 4157; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [0,3,0,0,5,0,2,7] 4158; AVX512-FCP-NEXT: vpermd %ymm12, %ymm15, %ymm12 4159; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,2,3,4,5,18,19,20,21,26,27,16,17,30,31,30,31,18,19,20,21] 4160; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2],ymm12[3,4,5,6,7] 4161; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm2[0],xmm3[1],xmm2[2,3] 4162; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[6,7,0,1,10,11,u,u,u,u,u,u,u,u,u,u] 4163; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm15 = ymm5[0,1],ymm4[2],ymm5[3],ymm4[4],ymm5[5,6],ymm4[7],ymm5[8,9],ymm4[10],ymm5[11],ymm4[12],ymm5[13,14],ymm4[15] 4164; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm18 = [1,3,6,0,5,0,0,0] 4165; AVX512-FCP-NEXT: vpermd %ymm15, %ymm18, %ymm15 4166; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm15 = 
ymm15[2,3,2,3,4,5,0,1,6,7,8,9,14,15,4,5,18,19,18,19,20,21,16,17,22,23,24,25,30,31,20,21] 4167; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2],xmm15[3,4,5,6,7] 4168; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm15[4,5,6,7] 4169; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [2,4,7,0,2,4,7,0] 4170; AVX512-FCP-NEXT: # ymm15 = mem[0,1,0,1] 4171; AVX512-FCP-NEXT: vpermd %ymm7, %ymm15, %ymm15 4172; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm15, %ymm13 4173; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm13, %zmm12 4174; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = zmm12 ^ (mem & (zmm12 ^ zmm6)) 4175; AVX512-FCP-NEXT: vextracti64x4 $1, %zmm12, %ymm6 4176; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] 4177; AVX512-FCP-NEXT: vextracti128 $1, %ymm13, %xmm15 4178; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm15[1,2,3],xmm13[4,5],xmm15[6,7] 4179; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,0,1,10,11,4,5,14,15,8,9,2,3,12,13] 4180; AVX512-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 4181; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm6[0],ymm13[1,2,3,4,5,6,7],ymm6[8],ymm13[9,10,11,12,13,14,15] 4182; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm13[4,5,6,7] 4183; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm12, %zmm6 4184; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10,11],ymm10[12],ymm11[13],ymm10[14],ymm11[15] 4185; AVX512-FCP-NEXT: vextracti128 $1, %ymm10, %xmm11 4186; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3,4],xmm11[5,6,7] 4187; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[8,9,2,3,12,13,6,7,0,1,10,11,u,u,u,u] 4188; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1],ymm8[2],ymm9[3],ymm8[4],ymm9[5,6],ymm8[7],ymm9[8,9],ymm8[10],ymm9[11],ymm8[12],ymm9[13,14],ymm8[15] 4189; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [1,3,0,0,6,0,3,5] 4190; AVX512-FCP-NEXT: vpermd %ymm8, %ymm9, %ymm8 4191; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,16,17,22,23,16,17,22,23] 4192; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2],ymm8[3,4,5,6,7] 4193; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3] 4194; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u] 4195; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm5[1,2],ymm4[3],ymm5[4],ymm4[5],ymm5[6,7],ymm4[8],ymm5[9,10],ymm4[11],ymm5[12],ymm4[13],ymm5[14,15] 4196; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,4,6,3,6,0,0,0] 4197; AVX512-FCP-NEXT: vpermd %ymm3, %ymm4, %ymm3 4198; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,2,3,4,5,10,11,0,1,14,15,16,17,18,19,20,21,18,19,20,21,26,27,16,17,30,31] 4199; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4,5,6,7] 4200; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] 4201; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,2,1,3,0,2,5,7] 4202; AVX512-FCP-NEXT: vpermd %ymm7, %ymm3, %ymm3 4203; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,1,6,7,4,5,6,7,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] 4204; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 4205; AVX512-FCP-NEXT: movb $7, %al 4206; AVX512-FCP-NEXT: kmovw %eax, %k1 4207; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm2 {%k1} 4208; AVX512-FCP-NEXT: vextracti64x4 $1, %zmm2, %ymm3 4209; AVX512-FCP-NEXT: vpblendw 
{{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] 4210; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 4211; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6],xmm1[7] 4212; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,2,3,12,13,6,7,0,1,10,11,4,5,14,15] 4213; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 4214; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3,4,5,6,7],ymm3[8],ymm0[9,10,11,12,13,14,15] 4215; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] 4216; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 4217; AVX512-FCP-NEXT: vmovdqa64 %zmm16, (%rsi) 4218; AVX512-FCP-NEXT: vmovdqa64 %zmm17, (%rdx) 4219; AVX512-FCP-NEXT: vmovdqa64 %zmm14, (%rcx) 4220; AVX512-FCP-NEXT: vmovdqa64 %zmm6, (%r8) 4221; AVX512-FCP-NEXT: vmovdqa64 %zmm0, (%r9) 4222; AVX512-FCP-NEXT: vzeroupper 4223; AVX512-FCP-NEXT: retq 4224; 4225; AVX512DQ-LABEL: load_i16_stride5_vf32: 4226; AVX512DQ: # %bb.0: 4227; AVX512DQ-NEXT: vmovdqa 256(%rdi), %ymm0 4228; AVX512DQ-NEXT: vmovdqa 288(%rdi), %ymm1 4229; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15] 4230; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3 4231; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4],xmm2[5,6,7] 4232; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,4,5,14,15,8,9,2,3,12,13,6,7] 4233; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm4 4234; AVX512DQ-NEXT: vmovdqa 192(%rdi), %ymm3 4235; AVX512DQ-NEXT: vmovdqa 224(%rdi), %ymm9 4236; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm5 = ymm9[0],ymm3[1],ymm9[2,3],ymm3[4],ymm9[5],ymm3[6],ymm9[7,8],ymm3[9],ymm9[10,11],ymm3[12],ymm9[13],ymm3[14],ymm9[15] 4237; AVX512DQ-NEXT: vextracti128 $1, %ymm5, %xmm6 4238; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1,2],ymm6[3],ymm5[4,5,6,7] 4239; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,8,9,2,3,12,13,6,7,16,17,26,27,26,27,30,31,24,25,18,19,28,29,22,23] 4240; AVX512DQ-NEXT: vpshufb %ymm7, %ymm5, %ymm5 4241; AVX512DQ-NEXT: vmovdqa64 176(%rdi), %xmm20 4242; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm8 = xmm20[3,1,2,3] 4243; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[2,1,2,3,4,5,6,7] 4244; AVX512DQ-NEXT: vmovdqa 160(%rdi), %xmm6 4245; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm10 = xmm6[0,2,2,3] 4246; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,3,2,3,4,5,6,7] 4247; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm10[0],xmm8[0],xmm10[1],xmm8[1] 4248; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1],ymm5[2,3,4,5,6,7] 4249; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm14 = ymm5[0,1,2,3,4],ymm4[5,6,7] 4250; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm8 4251; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm10 4252; AVX512DQ-NEXT: vmovdqa 64(%rdi), %ymm4 4253; AVX512DQ-NEXT: vmovdqa 96(%rdi), %ymm5 4254; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm11 = ymm4[0],ymm5[1,2],ymm4[3],ymm5[4],ymm4[5],ymm5[6,7],ymm4[8],ymm5[9,10],ymm4[11],ymm5[12],ymm4[13],ymm5[14,15] 4255; AVX512DQ-NEXT: vpermq {{.*#+}} ymm12 = ymm11[2,3,0,1] 4256; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm12[5],ymm11[6],ymm12[7] 4257; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm11[6,7,16,17,26,27,20,21,30,31,24,25],zero,zero,zero,zero,zero,zero 4258; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm12 = 
ymm8[0],ymm10[1],ymm8[2,3],ymm10[4],ymm8[5],ymm10[6],ymm8[7,8],ymm10[9],ymm8[10,11],ymm10[12],ymm8[13],ymm10[14],ymm8[15] 4259; AVX512DQ-NEXT: vextracti128 $1, %ymm12, %xmm13 4260; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm13[1,2,3],xmm12[4,5],xmm13[6,7] 4261; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,1,10,11,4,5,14,15,8,9,2,3,12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[u,u,u,u,u,u] 4262; AVX512DQ-NEXT: vpor %ymm11, %ymm12, %ymm15 4263; AVX512DQ-NEXT: vmovdqa 144(%rdi), %xmm11 4264; AVX512DQ-NEXT: vmovdqa 128(%rdi), %xmm12 4265; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm13 = xmm12[0],xmm11[1],xmm12[2,3] 4266; AVX512DQ-NEXT: vpshufb %xmm7, %xmm13, %xmm7 4267; AVX512DQ-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 4268; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm18 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] 4269; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm18 & (zmm7 ^ zmm15)) 4270; AVX512DQ-NEXT: vinserti64x4 $1, %ymm14, %zmm7, %zmm16 4271; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm7 = ymm3[0],ymm9[1],ymm3[2,3],ymm9[4],ymm3[5],ymm9[6],ymm3[7,8],ymm9[9],ymm3[10,11],ymm9[12],ymm3[13],ymm9[14],ymm3[15] 4272; AVX512DQ-NEXT: vextracti128 $1, %ymm7, %xmm14 4273; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0],ymm14[1],ymm7[2],ymm14[3],ymm7[4,5,6,7] 4274; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm14 = [2,3,12,13,0,1,0,1,10,11,4,5,14,15,8,9,18,19,28,29,16,17,16,17,26,27,20,21,30,31,24,25] 4275; AVX512DQ-NEXT: vpshufb %ymm14, %ymm7, %ymm7 4276; AVX512DQ-NEXT: vpsrlq $48, %xmm20, %xmm15 4277; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm13 = xmm6[0,3,2,3] 4278; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[1,2,2,3,4,5,6,7] 4279; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1] 4280; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2],xmm7[3,4,5,6,7] 4281; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm13[0,1,2,3],ymm7[4,5,6,7] 4282; AVX512DQ-NEXT: vmovdqa %ymm0, %ymm2 4283; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm13 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15] 4284; AVX512DQ-NEXT: vextracti128 $1, %ymm13, %xmm15 4285; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0],xmm13[1],xmm15[2],xmm13[3] 4286; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,u,u,6,7,0,1,10,11,4,5,14,15,8,9] 4287; AVX512DQ-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 4288; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm13[5,6,7] 4289; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm13 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13],ymm4[14],ymm5[15] 4290; AVX512DQ-NEXT: vpermq {{.*#+}} ymm15 = ymm13[2,3,0,1] 4291; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm15[5],ymm13[6,7] 4292; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm15 = ymm10[0],ymm8[1],ymm10[2],ymm8[3],ymm10[4,5],ymm8[6],ymm10[7,8],ymm8[9],ymm10[10],ymm8[11],ymm10[12,13],ymm8[14],ymm10[15] 4293; AVX512DQ-NEXT: vextracti128 $1, %ymm15, %xmm0 4294; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm15[0,1],xmm0[2,3],xmm15[4,5,6],xmm0[7] 4295; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[8,9,18,19,28,29,22,23,16,17,26,27],zero,zero,zero,zero,zero,zero 4296; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,12,13,6,7,0,1,10,11,4,5,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[u,u,u,u,u,u] 4297; AVX512DQ-NEXT: vpor %ymm0, 
%ymm13, %ymm0 4298; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm13 = xmm12[0,1],xmm11[2],xmm12[3] 4299; AVX512DQ-NEXT: vpshufb %xmm14, %xmm13, %xmm13 4300; AVX512DQ-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 4301; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm13 = zmm13 ^ (zmm18 & (zmm13 ^ zmm0)) 4302; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm13, %zmm19 4303; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm3[1],ymm9[2],ymm3[3],ymm9[4,5],ymm3[6],ymm9[7,8],ymm3[9],ymm9[10],ymm3[11],ymm9[12,13],ymm3[14],ymm9[15] 4304; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm7 4305; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[2,3,4,5,6,7] 4306; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm7 = [4,5,14,15,12,13,2,3,12,13,6,7,0,1,10,11,20,21,30,31,28,29,18,19,28,29,22,23,16,17,26,27] 4307; AVX512DQ-NEXT: vpshufb %ymm7, %ymm0, %ymm0 4308; AVX512DQ-NEXT: vmovdqa64 %ymm7, %ymm21 4309; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[0,1,1,3] 4310; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,7,6,7] 4311; AVX512DQ-NEXT: vmovdqa64 %xmm20, %xmm15 4312; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm7[2],xmm20[2],xmm7[3],xmm20[3] 4313; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm0[3,4,5,6,7] 4314; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] 4315; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] 4316; AVX512DQ-NEXT: vextracti128 $1, %ymm7, %xmm13 4317; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm7 = xmm13[0,1,2],xmm7[3,4],xmm13[5,6,7] 4318; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,8,9,2,3,12,13,6,7,0,1,10,11] 4319; AVX512DQ-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 4320; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm0[0,1,2,3,4],ymm7[5,6,7] 4321; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0,1],ymm3[2],ymm9[3],ymm3[4],ymm9[5,6],ymm3[7],ymm9[8,9],ymm3[10],ymm9[11],ymm3[12],ymm9[13,14],ymm3[15] 4322; AVX512DQ-NEXT: vmovdqa64 %ymm9, %ymm20 4323; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm13 4324; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm13[2],ymm0[3,4,5,6,7] 4325; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm13 = [6,7,6,7,8,9,4,5,14,15,8,9,2,3,12,13,22,23,22,23,24,25,20,21,30,31,24,25,18,19,28,29] 4326; AVX512DQ-NEXT: vpshufb %ymm13, %ymm0, %ymm0 4327; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0],xmm6[1],xmm15[2,3] 4328; AVX512DQ-NEXT: vmovdqa64 %xmm15, %xmm22 4329; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[6,7,0,1,10,11,u,u,u,u,u,u,u,u,u,u] 4330; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1,2],xmm0[3,4,5,6,7] 4331; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] 4332; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm14 = xmm11[0,1],xmm12[2],xmm11[3] 4333; AVX512DQ-NEXT: vpshufb %xmm13, %xmm14, %xmm13 4334; AVX512DQ-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 4335; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm13, %zmm0 4336; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm13 = ymm8[0],ymm10[1,2],ymm8[3],ymm10[4],ymm8[5],ymm10[6,7],ymm8[8],ymm10[9,10],ymm8[11],ymm10[12],ymm8[13],ymm10[14,15] 4337; AVX512DQ-NEXT: vextracti128 $1, %ymm13, %xmm14 4338; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm13 = xmm14[0],xmm13[1],xmm14[2],xmm13[3] 4339; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm14 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15] 4340; AVX512DQ-NEXT: vpermq {{.*#+}} ymm15 = ymm14[2,3,0,1] 4341; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4],ymm14[5],ymm15[6],ymm14[7] 4342; AVX512DQ-NEXT: vpshufb 
{{.*#+}} xmm13 = xmm13[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u] 4343; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29] 4344; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2],ymm14[3,4,5,6,7] 4345; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm14 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13],ymm1[14],ymm2[15] 4346; AVX512DQ-NEXT: vmovdqa %ymm2, %ymm9 4347; AVX512DQ-NEXT: vextracti128 $1, %ymm14, %xmm15 4348; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0],xmm15[1,2,3],xmm14[4,5],xmm15[6,7] 4349; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm13 = zmm0 ^ (mem & (zmm13 ^ zmm0)) 4350; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm14[u,u,0,1,10,11,4,5,14,15,8,9,2,3,12,13] 4351; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 4352; AVX512DQ-NEXT: vextracti64x4 $1, %zmm13, %ymm14 4353; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm14[0],ymm0[1,2,3,4,5,6,7],ymm14[8],ymm0[9,10,11,12,13,14,15] 4354; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7] 4355; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm13, %zmm17 4356; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm11[3,1,2,3] 4357; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7] 4358; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm13 = xmm12[0,2,2,3] 4359; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,1,0,3,4,5,6,7] 4360; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm13[0],xmm0[0],xmm13[1],xmm0[1] 4361; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm13 = ymm10[0,1],ymm8[2],ymm10[3],ymm8[4],ymm10[5,6],ymm8[7],ymm10[8,9],ymm8[10],ymm10[11],ymm8[12],ymm10[13,14],ymm8[15] 4362; AVX512DQ-NEXT: vextracti128 $1, %ymm13, %xmm14 4363; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2],xmm14[3,4],xmm13[5,6,7] 4364; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm14 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10,11],ymm5[12],ymm4[13],ymm5[14],ymm4[15] 4365; AVX512DQ-NEXT: vpermq {{.*#+}} ymm15 = ymm14[2,3,0,1] 4366; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6],ymm14[7] 4367; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u] 4368; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27] 4369; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2],ymm14[3,4,5,6,7] 4370; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0],xmm12[1],xmm11[2,3] 4371; AVX512DQ-NEXT: vmovdqa64 %ymm21, %ymm2 4372; AVX512DQ-NEXT: vpshufb %xmm2, %xmm11, %xmm11 4373; AVX512DQ-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 4374; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm11 = zmm11 ^ (zmm18 & (zmm11 ^ zmm13)) 4375; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm11, %zmm7 4376; AVX512DQ-NEXT: vmovdqa64 %ymm20, %ymm2 4377; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1,2],ymm3[3],ymm2[4],ymm3[5],ymm2[6,7],ymm3[8],ymm2[9,10],ymm3[11],ymm2[12],ymm3[13],ymm2[14,15] 4378; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3 4379; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3,4,5,6,7] 4380; AVX512DQ-NEXT: vmovdqa64 %xmm22, %xmm3 4381; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm6[2],xmm3[3] 4382; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[8,9,14,15,4,5,6,7,0,1,10,11,4,5,14,15,24,25,30,31,20,21,22,23,16,17,26,27,20,21,30,31] 4383; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u] 4384; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm2[3,4,5,6,7] 4385; AVX512DQ-NEXT: vpblendd 
{{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] 4386; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 4387; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 4388; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm10[0],ymm8[1],ymm10[2,3],ymm8[4],ymm10[5],ymm8[6],ymm10[7,8],ymm8[9],ymm10[10,11],ymm8[12],ymm10[13],ymm8[14],ymm10[15] 4389; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3 4390; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7] 4391; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0,1],ymm4[2],ymm5[3],ymm4[4],ymm5[5,6],ymm4[7],ymm5[8,9],ymm4[10],ymm5[11],ymm4[12],ymm5[13,14],ymm4[15] 4392; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] 4393; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4],ymm3[5,6],ymm4[7] 4394; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[8,9,2,3,12,13,6,7,0,1,10,11,u,u,u,u] 4395; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,4,5,14,15,24,25,18,19,28,29,22,23,20,21,22,23,20,21,30,31] 4396; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3,4,5,6,7] 4397; AVX512DQ-NEXT: movb $7, %al 4398; AVX512DQ-NEXT: kmovw %eax, %k1 4399; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 {%k1} 4400; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm9[1],ymm1[2],ymm9[3],ymm1[4,5],ymm9[6],ymm1[7,8],ymm9[9],ymm1[10],ymm9[11],ymm1[12,13],ymm9[14],ymm1[15] 4401; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm2 4402; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6],xmm2[7] 4403; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2 4404; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,2,3,12,13,6,7,0,1,10,11,4,5,14,15] 4405; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 4406; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15] 4407; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] 4408; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 4409; AVX512DQ-NEXT: vmovdqa64 %zmm16, (%rsi) 4410; AVX512DQ-NEXT: vmovdqa64 %zmm19, (%rdx) 4411; AVX512DQ-NEXT: vmovdqa64 %zmm7, (%rcx) 4412; AVX512DQ-NEXT: vmovdqa64 %zmm17, (%r8) 4413; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%r9) 4414; AVX512DQ-NEXT: vzeroupper 4415; AVX512DQ-NEXT: retq 4416; 4417; AVX512DQ-FCP-LABEL: load_i16_stride5_vf32: 4418; AVX512DQ-FCP: # %bb.0: 4419; AVX512DQ-FCP-NEXT: vmovdqa 176(%rdi), %xmm2 4420; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm2[4,5,14,15,4,5,6,7,u,u,u,u,u,u,u,u] 4421; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %xmm3 4422; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm3[0,1,10,11,8,9,10,11,u,u,u,u,u,u,u,u] 4423; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 4424; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %ymm4 4425; AVX512DQ-FCP-NEXT: vmovdqa 224(%rdi), %ymm5 4426; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13],ymm4[14],ymm5[15] 4427; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [2,4,7,1,4,6,0,0] 4428; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm6, %ymm1 4429; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,8,9,14,15,0,1,6,7,16,17,22,23,u,u,u,u,u,u,u,u,u,u,u,u] 4430; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [8,9,3,2,4,0,0,0] 4431; AVX512DQ-FCP-NEXT: vpermi2d %ymm0, %ymm1, %ymm6 4432; AVX512DQ-FCP-NEXT: vmovdqa 256(%rdi), %ymm0 4433; AVX512DQ-FCP-NEXT: vmovdqa 288(%rdi), %ymm1 4434; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15] 
4435; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm7, %xmm8 4436; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm8[3,4],xmm7[5,6,7] 4437; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[u,u,u,u,4,5,14,15,8,9,2,3,12,13,6,7] 4438; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 4439; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6,7] 4440; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm10 4441; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm11 4442; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm8 4443; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %ymm9 4444; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm9[1,2],ymm8[3],ymm9[4],ymm8[5],ymm9[6,7],ymm8[8],ymm9[9,10],ymm8[11],ymm9[12],ymm8[13],ymm9[14,15] 4445; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm12 = [1,0,0,0,4,6,1,3] 4446; AVX512DQ-FCP-NEXT: vpermd %ymm7, %ymm12, %ymm7 4447; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[2,3,16,17,22,23,24,25,30,31,20,21],zero,zero,zero,zero,zero,zero 4448; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm10[0],ymm11[1],ymm10[2,3],ymm11[4],ymm10[5],ymm11[6],ymm10[7,8],ymm11[9],ymm10[10,11],ymm11[12],ymm10[13],ymm11[14],ymm10[15] 4449; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm12, %xmm13 4450; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm13[1,2,3],xmm12[4,5],xmm13[6,7] 4451; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,1,10,11,4,5,14,15,8,9,2,3,12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[u,u,u,u,u,u] 4452; AVX512DQ-FCP-NEXT: vpor %ymm7, %ymm12, %ymm12 4453; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [0,3,1,0,0,3,5,0] 4454; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm7 4455; AVX512DQ-FCP-NEXT: vpermd %ymm7, %ymm13, %ymm13 4456; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] 4457; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm13, %ymm13 4458; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm18 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] 4459; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm13 = zmm13 ^ (zmm18 & (zmm13 ^ zmm12)) 4460; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm13, %zmm16 4461; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm11[0],ymm10[1],ymm11[2],ymm10[3],ymm11[4,5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10],ymm10[11],ymm11[12,13],ymm10[14],ymm11[15] 4462; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm12, %xmm13 4463; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1],xmm13[2,3],xmm12[4,5,6],xmm13[7] 4464; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u] 4465; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm12, %ymm12 4466; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm9[0],ymm8[1],ymm9[2,3],ymm8[4],ymm9[5],ymm8[6],ymm9[7,8],ymm8[9],ymm9[10,11],ymm8[12],ymm9[13],ymm8[14],ymm9[15] 4467; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm17 = [2,0,0,0,4,7,1,6] 4468; AVX512DQ-FCP-NEXT: vpermd %ymm13, %ymm17, %ymm13 4469; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[0,1,18,19,20,21,26,27,16,17,30,31],zero,zero,zero,zero,zero,zero 4470; AVX512DQ-FCP-NEXT: vpor %ymm13, %ymm12, %ymm12 4471; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm13 = [1,3,2,0,1,3,6,0] 4472; AVX512DQ-FCP-NEXT: vpermd %ymm7, %ymm13, %ymm15 4473; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = 
[0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25] 4474; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm15, %ymm15 4475; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm15 = zmm15 ^ (zmm18 & (zmm15 ^ zmm12)) 4476; AVX512DQ-FCP-NEXT: vpshufb %xmm6, %xmm3, %xmm6 4477; AVX512DQ-FCP-NEXT: vpsrlq $48, %xmm2, %xmm12 4478; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1] 4479; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10,11],ymm5[12],ymm4[13],ymm5[14],ymm4[15] 4480; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm17 = [0,2,5,7,4,7,0,0] 4481; AVX512DQ-FCP-NEXT: vpermd %ymm12, %ymm17, %ymm12 4482; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[2,3,4,5,4,5,0,1,6,7,8,9,14,15,4,5,18,19,20,21,20,21,16,17,22,23,24,25,30,31,20,21] 4483; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm12[3,4,5,6,7] 4484; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm12[4,5,6,7] 4485; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15] 4486; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} ymm17 = [1,4,6,3,1,4,6,3] 4487; AVX512DQ-FCP-NEXT: # ymm17 = mem[0,1,2,3,0,1,2,3] 4488; AVX512DQ-FCP-NEXT: vpermd %ymm12, %ymm17, %ymm12 4489; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27,16,17,30,31,24,25] 4490; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm12[5,6,7] 4491; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm15, %zmm17 4492; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] 4493; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm6, %xmm15 4494; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm15[0,1,2],xmm6[3,4],xmm15[5,6,7] 4495; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u,u,u,8,9,2,3,12,13,6,7,0,1,10,11] 4496; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 4497; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm3[u,u,u,u,u,u,u,u,4,5,14,15,u,u,u,u] 4498; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm15 = xmm15[2],xmm2[2],xmm15[3],xmm2[3] 4499; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15] 4500; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm19 = [0,3,5,2,5,7,0,0] 4501; AVX512DQ-FCP-NEXT: vpermd %ymm12, %ymm19, %ymm12 4502; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,1,6,7,2,3,2,3,4,5,10,11,0,1,14,15,16,17,22,23,18,19,18,19,20,21,26,27,16,17,30,31] 4503; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm15 = xmm15[0,1,2],xmm12[3,4,5,6,7] 4504; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0,1,2,3],ymm12[4,5,6,7] 4505; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1,2,3,4],ymm6[5,6,7] 4506; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm11[0,1],ymm10[2],ymm11[3],ymm10[4],ymm11[5,6],ymm10[7],ymm11[8,9],ymm10[10],ymm11[11],ymm10[12],ymm11[13,14],ymm10[15] 4507; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm12, %xmm15 4508; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2],xmm15[3,4],xmm12[5,6,7] 4509; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u] 4510; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm15 = ymm8[0],ymm9[1],ymm8[2,3],ymm9[4],ymm8[5],ymm9[6],ymm8[7,8],ymm9[9],ymm8[10,11],ymm9[12],ymm8[13],ymm9[14],ymm8[15] 4511; AVX512DQ-FCP-NEXT: vpmovsxbd 
{{.*#+}} ymm19 = [0,2,0,0,5,7,2,4] 4512; AVX512DQ-FCP-NEXT: vpermd %ymm15, %ymm19, %ymm15 4513; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23] 4514; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2],ymm15[3,4,5,6,7] 4515; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [1,4,6,0,1,4,6,0] 4516; AVX512DQ-FCP-NEXT: # ymm15 = mem[0,1,0,1] 4517; AVX512DQ-FCP-NEXT: vpermd %ymm7, %ymm15, %ymm15 4518; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm15, %ymm14 4519; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 ^ (zmm18 & (zmm14 ^ zmm12)) 4520; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm14, %zmm14 4521; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm10[0],ymm11[1,2],ymm10[3],ymm11[4],ymm10[5],ymm11[6,7],ymm10[8],ymm11[9,10],ymm10[11],ymm11[12],ymm10[13],ymm11[14,15] 4522; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm6, %xmm12 4523; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm12[0],xmm6[1],xmm12[2],xmm6[3] 4524; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u] 4525; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm12 = ymm9[0],ymm8[1],ymm9[2],ymm8[3],ymm9[4,5],ymm8[6],ymm9[7,8],ymm8[9],ymm9[10],ymm8[11],ymm9[12,13],ymm8[14],ymm9[15] 4526; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [0,3,0,0,5,0,2,7] 4527; AVX512DQ-FCP-NEXT: vpermd %ymm12, %ymm15, %ymm12 4528; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,2,3,4,5,18,19,20,21,26,27,16,17,30,31,30,31,18,19,20,21] 4529; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2],ymm12[3,4,5,6,7] 4530; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm2[0],xmm3[1],xmm2[2,3] 4531; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[6,7,0,1,10,11,u,u,u,u,u,u,u,u,u,u] 4532; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm15 = ymm5[0,1],ymm4[2],ymm5[3],ymm4[4],ymm5[5,6],ymm4[7],ymm5[8,9],ymm4[10],ymm5[11],ymm4[12],ymm5[13,14],ymm4[15] 4533; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm18 = [1,3,6,0,5,0,0,0] 4534; AVX512DQ-FCP-NEXT: vpermd %ymm15, %ymm18, %ymm15 4535; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[2,3,2,3,4,5,0,1,6,7,8,9,14,15,4,5,18,19,18,19,20,21,16,17,22,23,24,25,30,31,20,21] 4536; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2],xmm15[3,4,5,6,7] 4537; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm15[4,5,6,7] 4538; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [2,4,7,0,2,4,7,0] 4539; AVX512DQ-FCP-NEXT: # ymm15 = mem[0,1,0,1] 4540; AVX512DQ-FCP-NEXT: vpermd %ymm7, %ymm15, %ymm15 4541; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm15, %ymm13 4542; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm13, %zmm12 4543; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm12 = zmm12 ^ (mem & (zmm12 ^ zmm6)) 4544; AVX512DQ-FCP-NEXT: vextracti64x4 $1, %zmm12, %ymm6 4545; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] 4546; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm13, %xmm15 4547; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0],xmm15[1,2,3],xmm13[4,5],xmm15[6,7] 4548; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,0,1,10,11,4,5,14,15,8,9,2,3,12,13] 4549; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 4550; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm13 = ymm6[0],ymm13[1,2,3,4,5,6,7],ymm6[8],ymm13[9,10,11,12,13,14,15] 4551; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm13[4,5,6,7] 4552; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm12, %zmm6 4553; 
AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10,11],ymm10[12],ymm11[13],ymm10[14],ymm11[15] 4554; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm10, %xmm11 4555; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3,4],xmm11[5,6,7] 4556; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[8,9,2,3,12,13,6,7,0,1,10,11,u,u,u,u] 4557; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1],ymm8[2],ymm9[3],ymm8[4],ymm9[5,6],ymm8[7],ymm9[8,9],ymm8[10],ymm9[11],ymm8[12],ymm9[13,14],ymm8[15] 4558; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [1,3,0,0,6,0,3,5] 4559; AVX512DQ-FCP-NEXT: vpermd %ymm8, %ymm9, %ymm8 4560; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,16,17,22,23,16,17,22,23] 4561; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2],ymm8[3,4,5,6,7] 4562; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3] 4563; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u] 4564; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm5[1,2],ymm4[3],ymm5[4],ymm4[5],ymm5[6,7],ymm4[8],ymm5[9,10],ymm4[11],ymm5[12],ymm4[13],ymm5[14,15] 4565; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,4,6,3,6,0,0,0] 4566; AVX512DQ-FCP-NEXT: vpermd %ymm3, %ymm4, %ymm3 4567; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,2,3,4,5,10,11,0,1,14,15,16,17,18,19,20,21,18,19,20,21,26,27,16,17,30,31] 4568; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4,5,6,7] 4569; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] 4570; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,2,1,3,0,2,5,7] 4571; AVX512DQ-FCP-NEXT: vpermd %ymm7, %ymm3, %ymm3 4572; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,1,6,7,4,5,6,7,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] 4573; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 4574; AVX512DQ-FCP-NEXT: movb $7, %al 4575; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 4576; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm0, %zmm2 {%k1} 4577; AVX512DQ-FCP-NEXT: vextracti64x4 $1, %zmm2, %ymm3 4578; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15] 4579; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 4580; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6],xmm1[7] 4581; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,2,3,12,13,6,7,0,1,10,11,4,5,14,15] 4582; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 4583; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3,4,5,6,7],ymm3[8],ymm0[9,10,11,12,13,14,15] 4584; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] 4585; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 4586; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, (%rsi) 4587; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, (%rdx) 4588; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, (%rcx) 4589; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, (%r8) 4590; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, (%r9) 4591; AVX512DQ-FCP-NEXT: vzeroupper 4592; AVX512DQ-FCP-NEXT: retq 4593; 4594; AVX512BW-LABEL: load_i16_stride5_vf32: 4595; AVX512BW: # %bb.0: 4596; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm0 4597; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1 4598; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm2 4599; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm3 4600; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm4 4601; AVX512BW-NEXT: vbroadcasti64x4 
{{.*#+}} zmm5 = [16,21,26,31,36,41,46,51,56,61,0,0,0,1,6,11,16,21,26,31,36,41,46,51,56,61,0,0,0,1,6,11] 4602; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] 4603; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm5 4604; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,5,10,15,20,25,30,35,40,45,50,55,60,0,0,0] 4605; AVX512BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm6 4606; AVX512BW-NEXT: movl $67100672, %eax # imm = 0x3FFE000 4607; AVX512BW-NEXT: kmovd %eax, %k1 4608; AVX512BW-NEXT: vmovdqu16 %zmm5, %zmm6 {%k1} 4609; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,34,39,44,49,54,59] 4610; AVX512BW-NEXT: vpermi2w %zmm0, %zmm6, %zmm5 4611; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [49,54,59,0,5,10,15,20,25,30,0,0,0,34,39,44,49,54,59,0,5,10,15,20,25,30,0,0,0,34,39,44] 4612; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] 4613; AVX512BW-NEXT: vpermi2w %zmm3, %zmm4, %zmm6 4614; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm7 = [1,6,11,16,21,26,31,36,41,46,51,56,61,0,0,0] 4615; AVX512BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm7 4616; AVX512BW-NEXT: vmovdqu16 %zmm6, %zmm7 {%k1} 4617; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,35,40,45,50,55,60] 4618; AVX512BW-NEXT: vpermi2w %zmm0, %zmm7, %zmm6 4619; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [50,55,60,1,6,11,16,21,26,31,0,0,0,35,40,45,50,55,60,1,6,11,16,21,26,31,0,0,0,35,40,45] 4620; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] 4621; AVX512BW-NEXT: vpermi2w %zmm3, %zmm4, %zmm7 4622; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm8 = [34,39,44,49,54,59,0,5,10,15,20,25,30,0,0,0] 4623; AVX512BW-NEXT: vpermi2w %zmm1, %zmm2, %zmm8 4624; AVX512BW-NEXT: vmovdqu16 %zmm7, %zmm8 {%k1} 4625; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,36,41,46,51,56,61] 4626; AVX512BW-NEXT: vpermi2w %zmm0, %zmm8, %zmm7 4627; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [19,24,29,34,39,44,49,54,59,0,0,0,0,4,9,14,19,24,29,34,39,44,49,54,59,0,0,0,0,4,9,14] 4628; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] 4629; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm8 4630; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm9 = [35,40,45,50,55,60,1,6,11,16,21,26,31,0,0,0] 4631; AVX512BW-NEXT: vpermi2w %zmm1, %zmm2, %zmm9 4632; AVX512BW-NEXT: movl $33546240, %eax # imm = 0x1FFE000 4633; AVX512BW-NEXT: kmovd %eax, %k1 4634; AVX512BW-NEXT: vmovdqu16 %zmm8, %zmm9 {%k1} 4635; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,32,37,42,47,52,57,62] 4636; AVX512BW-NEXT: vpermi2w %zmm0, %zmm9, %zmm8 4637; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm9 = [4,9,14,19,24,29,34,39,44,49,54,59,0,0,0,0] 4638; AVX512BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm9 4639; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [20,25,30,35,40,45,50,55,60,0,0,0,0,5,10,15,20,25,30,35,40,45,50,55,60,0,0,0,0,5,10,15] 4640; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] 4641; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm1 4642; AVX512BW-NEXT: movb $7, %al 4643; AVX512BW-NEXT: kmovd %eax, %k1 4644; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} 4645; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,33,38,43,48,53,58,63] 4646; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm2 4647; AVX512BW-NEXT: vmovdqa64 %zmm5, (%rsi) 4648; AVX512BW-NEXT: vmovdqa64 %zmm6, (%rdx) 4649; AVX512BW-NEXT: vmovdqa64 %zmm7, (%rcx) 4650; AVX512BW-NEXT: vmovdqa64 %zmm8, (%r8) 4651; AVX512BW-NEXT: vmovdqa64 %zmm2, (%r9) 4652; 
AVX512BW-NEXT: vzeroupper 4653; AVX512BW-NEXT: retq 4654; 4655; AVX512BW-FCP-LABEL: load_i16_stride5_vf32: 4656; AVX512BW-FCP: # %bb.0: 4657; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm0 4658; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 4659; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2 4660; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm3 4661; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm4 4662; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [16,21,26,31,36,41,46,51,56,61,0,0,0,1,6,11,16,21,26,31,36,41,46,51,56,61,0,0,0,1,6,11] 4663; AVX512BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] 4664; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm3, %zmm5 4665; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,5,10,15,20,25,30,35,40,45,50,55,60,0,0,0] 4666; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm6 4667; AVX512BW-FCP-NEXT: movl $67100672, %eax # imm = 0x3FFE000 4668; AVX512BW-FCP-NEXT: kmovd %eax, %k1 4669; AVX512BW-FCP-NEXT: vmovdqu16 %zmm5, %zmm6 {%k1} 4670; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,34,39,44,49,54,59] 4671; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm6, %zmm5 4672; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [49,54,59,0,5,10,15,20,25,30,0,0,0,34,39,44,49,54,59,0,5,10,15,20,25,30,0,0,0,34,39,44] 4673; AVX512BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] 4674; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm4, %zmm6 4675; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [1,6,11,16,21,26,31,36,41,46,51,56,61,0,0,0] 4676; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm7 4677; AVX512BW-FCP-NEXT: vmovdqu16 %zmm6, %zmm7 {%k1} 4678; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,35,40,45,50,55,60] 4679; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm7, %zmm6 4680; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [50,55,60,1,6,11,16,21,26,31,0,0,0,35,40,45,50,55,60,1,6,11,16,21,26,31,0,0,0,35,40,45] 4681; AVX512BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] 4682; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm4, %zmm7 4683; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [34,39,44,49,54,59,0,5,10,15,20,25,30,0,0,0] 4684; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm2, %zmm8 4685; AVX512BW-FCP-NEXT: vmovdqu16 %zmm7, %zmm8 {%k1} 4686; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,36,41,46,51,56,61] 4687; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm8, %zmm7 4688; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [19,24,29,34,39,44,49,54,59,0,0,0,0,4,9,14,19,24,29,34,39,44,49,54,59,0,0,0,0,4,9,14] 4689; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] 4690; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm3, %zmm8 4691; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [35,40,45,50,55,60,1,6,11,16,21,26,31,0,0,0] 4692; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm2, %zmm9 4693; AVX512BW-FCP-NEXT: movl $33546240, %eax # imm = 0x1FFE000 4694; AVX512BW-FCP-NEXT: kmovd %eax, %k1 4695; AVX512BW-FCP-NEXT: vmovdqu16 %zmm8, %zmm9 {%k1} 4696; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,32,37,42,47,52,57,62] 4697; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm9, %zmm8 4698; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [4,9,14,19,24,29,34,39,44,49,54,59,0,0,0,0] 4699; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm9 4700; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [20,25,30,35,40,45,50,55,60,0,0,0,0,5,10,15,20,25,30,35,40,45,50,55,60,0,0,0,0,5,10,15] 4701; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] 
4702; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm3, %zmm1 4703; AVX512BW-FCP-NEXT: movb $7, %al 4704; AVX512BW-FCP-NEXT: kmovd %eax, %k1 4705; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} 4706; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,33,38,43,48,53,58,63] 4707; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm2 4708; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, (%rsi) 4709; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, (%rdx) 4710; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, (%rcx) 4711; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, (%r8) 4712; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, (%r9) 4713; AVX512BW-FCP-NEXT: vzeroupper 4714; AVX512BW-FCP-NEXT: retq 4715; 4716; AVX512DQ-BW-LABEL: load_i16_stride5_vf32: 4717; AVX512DQ-BW: # %bb.0: 4718; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm0 4719; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm1 4720; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm2 4721; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm3 4722; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm4 4723; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [16,21,26,31,36,41,46,51,56,61,0,0,0,1,6,11,16,21,26,31,36,41,46,51,56,61,0,0,0,1,6,11] 4724; AVX512DQ-BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] 4725; AVX512DQ-BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm5 4726; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,5,10,15,20,25,30,35,40,45,50,55,60,0,0,0] 4727; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm6 4728; AVX512DQ-BW-NEXT: movl $67100672, %eax # imm = 0x3FFE000 4729; AVX512DQ-BW-NEXT: kmovd %eax, %k1 4730; AVX512DQ-BW-NEXT: vmovdqu16 %zmm5, %zmm6 {%k1} 4731; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,34,39,44,49,54,59] 4732; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm6, %zmm5 4733; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [49,54,59,0,5,10,15,20,25,30,0,0,0,34,39,44,49,54,59,0,5,10,15,20,25,30,0,0,0,34,39,44] 4734; AVX512DQ-BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] 4735; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm4, %zmm6 4736; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm7 = [1,6,11,16,21,26,31,36,41,46,51,56,61,0,0,0] 4737; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm7 4738; AVX512DQ-BW-NEXT: vmovdqu16 %zmm6, %zmm7 {%k1} 4739; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,35,40,45,50,55,60] 4740; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm7, %zmm6 4741; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [50,55,60,1,6,11,16,21,26,31,0,0,0,35,40,45,50,55,60,1,6,11,16,21,26,31,0,0,0,35,40,45] 4742; AVX512DQ-BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] 4743; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm4, %zmm7 4744; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm8 = [34,39,44,49,54,59,0,5,10,15,20,25,30,0,0,0] 4745; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm2, %zmm8 4746; AVX512DQ-BW-NEXT: vmovdqu16 %zmm7, %zmm8 {%k1} 4747; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,36,41,46,51,56,61] 4748; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm8, %zmm7 4749; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [19,24,29,34,39,44,49,54,59,0,0,0,0,4,9,14,19,24,29,34,39,44,49,54,59,0,0,0,0,4,9,14] 4750; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] 4751; AVX512DQ-BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm8 4752; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm9 = [35,40,45,50,55,60,1,6,11,16,21,26,31,0,0,0] 4753; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm2, %zmm9 4754; AVX512DQ-BW-NEXT: movl $33546240, %eax # imm = 0x1FFE000 4755; AVX512DQ-BW-NEXT: kmovd %eax, %k1 
4756; AVX512DQ-BW-NEXT: vmovdqu16 %zmm8, %zmm9 {%k1} 4757; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,32,37,42,47,52,57,62] 4758; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm9, %zmm8 4759; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm9 = [4,9,14,19,24,29,34,39,44,49,54,59,0,0,0,0] 4760; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm9 4761; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [20,25,30,35,40,45,50,55,60,0,0,0,0,5,10,15,20,25,30,35,40,45,50,55,60,0,0,0,0,5,10,15] 4762; AVX512DQ-BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] 4763; AVX512DQ-BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm1 4764; AVX512DQ-BW-NEXT: movb $7, %al 4765; AVX512DQ-BW-NEXT: kmovd %eax, %k1 4766; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} 4767; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,33,38,43,48,53,58,63] 4768; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm2 4769; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, (%rsi) 4770; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, (%rdx) 4771; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, (%rcx) 4772; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, (%r8) 4773; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, (%r9) 4774; AVX512DQ-BW-NEXT: vzeroupper 4775; AVX512DQ-BW-NEXT: retq 4776; 4777; AVX512DQ-BW-FCP-LABEL: load_i16_stride5_vf32: 4778; AVX512DQ-BW-FCP: # %bb.0: 4779; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm0 4780; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1 4781; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2 4782; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm3 4783; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm4 4784; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [16,21,26,31,36,41,46,51,56,61,0,0,0,1,6,11,16,21,26,31,36,41,46,51,56,61,0,0,0,1,6,11] 4785; AVX512DQ-BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] 4786; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm3, %zmm5 4787; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,5,10,15,20,25,30,35,40,45,50,55,60,0,0,0] 4788; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm6 4789; AVX512DQ-BW-FCP-NEXT: movl $67100672, %eax # imm = 0x3FFE000 4790; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 4791; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm5, %zmm6 {%k1} 4792; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,34,39,44,49,54,59] 4793; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm6, %zmm5 4794; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [49,54,59,0,5,10,15,20,25,30,0,0,0,34,39,44,49,54,59,0,5,10,15,20,25,30,0,0,0,34,39,44] 4795; AVX512DQ-BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] 4796; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm4, %zmm6 4797; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm7 = [1,6,11,16,21,26,31,36,41,46,51,56,61,0,0,0] 4798; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm7 4799; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm6, %zmm7 {%k1} 4800; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,35,40,45,50,55,60] 4801; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm7, %zmm6 4802; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [50,55,60,1,6,11,16,21,26,31,0,0,0,35,40,45,50,55,60,1,6,11,16,21,26,31,0,0,0,35,40,45] 4803; AVX512DQ-BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] 4804; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm4, %zmm7 4805; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [34,39,44,49,54,59,0,5,10,15,20,25,30,0,0,0] 4806; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm2, %zmm8 4807; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm7, 
%zmm8 {%k1} 4808; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,36,41,46,51,56,61] 4809; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm8, %zmm7 4810; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [19,24,29,34,39,44,49,54,59,0,0,0,0,4,9,14,19,24,29,34,39,44,49,54,59,0,0,0,0,4,9,14] 4811; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] 4812; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm3, %zmm8 4813; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [35,40,45,50,55,60,1,6,11,16,21,26,31,0,0,0] 4814; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm2, %zmm9 4815; AVX512DQ-BW-FCP-NEXT: movl $33546240, %eax # imm = 0x1FFE000 4816; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 4817; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm8, %zmm9 {%k1} 4818; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,32,37,42,47,52,57,62] 4819; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm9, %zmm8 4820; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [4,9,14,19,24,29,34,39,44,49,54,59,0,0,0,0] 4821; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm9 4822; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [20,25,30,35,40,45,50,55,60,0,0,0,0,5,10,15,20,25,30,35,40,45,50,55,60,0,0,0,0,5,10,15] 4823; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] 4824; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm3, %zmm1 4825; AVX512DQ-BW-FCP-NEXT: movb $7, %al 4826; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 4827; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1} 4828; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,33,38,43,48,53,58,63] 4829; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm2 4830; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, (%rsi) 4831; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, (%rdx) 4832; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, (%rcx) 4833; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, (%r8) 4834; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, (%r9) 4835; AVX512DQ-BW-FCP-NEXT: vzeroupper 4836; AVX512DQ-BW-FCP-NEXT: retq 4837 %wide.vec = load <160 x i16>, ptr %in.vec, align 64 4838 %strided.vec0 = shufflevector <160 x i16> %wide.vec, <160 x i16> poison, <32 x i32> <i32 0, i32 5, i32 10, i32 15, i32 20, i32 25, i32 30, i32 35, i32 40, i32 45, i32 50, i32 55, i32 60, i32 65, i32 70, i32 75, i32 80, i32 85, i32 90, i32 95, i32 100, i32 105, i32 110, i32 115, i32 120, i32 125, i32 130, i32 135, i32 140, i32 145, i32 150, i32 155> 4839 %strided.vec1 = shufflevector <160 x i16> %wide.vec, <160 x i16> poison, <32 x i32> <i32 1, i32 6, i32 11, i32 16, i32 21, i32 26, i32 31, i32 36, i32 41, i32 46, i32 51, i32 56, i32 61, i32 66, i32 71, i32 76, i32 81, i32 86, i32 91, i32 96, i32 101, i32 106, i32 111, i32 116, i32 121, i32 126, i32 131, i32 136, i32 141, i32 146, i32 151, i32 156> 4840 %strided.vec2 = shufflevector <160 x i16> %wide.vec, <160 x i16> poison, <32 x i32> <i32 2, i32 7, i32 12, i32 17, i32 22, i32 27, i32 32, i32 37, i32 42, i32 47, i32 52, i32 57, i32 62, i32 67, i32 72, i32 77, i32 82, i32 87, i32 92, i32 97, i32 102, i32 107, i32 112, i32 117, i32 122, i32 127, i32 132, i32 137, i32 142, i32 147, i32 152, i32 157> 4841 %strided.vec3 = shufflevector <160 x i16> %wide.vec, <160 x i16> poison, <32 x i32> <i32 3, i32 8, i32 13, i32 18, i32 23, i32 28, i32 33, i32 38, i32 43, i32 48, i32 53, i32 58, i32 63, i32 68, i32 73, i32 78, i32 83, i32 88, i32 93, i32 98, i32 103, i32 108, i32 113, i32 118, i32 123, i32 128, i32 133, i32 138, i32 143, i32 148, i32 
153, i32 158> 4842 %strided.vec4 = shufflevector <160 x i16> %wide.vec, <160 x i16> poison, <32 x i32> <i32 4, i32 9, i32 14, i32 19, i32 24, i32 29, i32 34, i32 39, i32 44, i32 49, i32 54, i32 59, i32 64, i32 69, i32 74, i32 79, i32 84, i32 89, i32 94, i32 99, i32 104, i32 109, i32 114, i32 119, i32 124, i32 129, i32 134, i32 139, i32 144, i32 149, i32 154, i32 159> 4843 store <32 x i16> %strided.vec0, ptr %out.vec0, align 64 4844 store <32 x i16> %strided.vec1, ptr %out.vec1, align 64 4845 store <32 x i16> %strided.vec2, ptr %out.vec2, align 64 4846 store <32 x i16> %strided.vec3, ptr %out.vec3, align 64 4847 store <32 x i16> %strided.vec4, ptr %out.vec4, align 64 4848 ret void 4849} 4850 4851define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4) nounwind { 4852; SSE-LABEL: load_i16_stride5_vf64: 4853; SSE: # %bb.0: 4854; SSE-NEXT: subq $1016, %rsp # imm = 0x3F8 4855; SSE-NEXT: movdqa 464(%rdi), %xmm5 4856; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4857; SSE-NEXT: movdqa 400(%rdi), %xmm8 4858; SSE-NEXT: movdqa 416(%rdi), %xmm11 4859; SSE-NEXT: movdqa 448(%rdi), %xmm4 4860; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4861; SSE-NEXT: movdqa 432(%rdi), %xmm7 4862; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4863; SSE-NEXT: movdqa 144(%rdi), %xmm6 4864; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4865; SSE-NEXT: movdqa 80(%rdi), %xmm15 4866; SSE-NEXT: movdqa 96(%rdi), %xmm10 4867; SSE-NEXT: movdqa 128(%rdi), %xmm14 4868; SSE-NEXT: movdqa 112(%rdi), %xmm2 4869; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4870; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,0,65535,65535,65535] 4871; SSE-NEXT: movdqa %xmm0, %xmm1 4872; SSE-NEXT: pandn %xmm2, %xmm1 4873; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[0,1,0,3] 4874; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4875; SSE-NEXT: pand %xmm0, %xmm2 4876; SSE-NEXT: por %xmm1, %xmm2 4877; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[3,1,2,3] 4878; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4879; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] 4880; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[0,2,2,3] 4881; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] 4882; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] 4883; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3] 4884; SSE-NEXT: movaps {{.*#+}} xmm13 = [65535,65535,65535,65535,65535,65535,65535,0] 4885; SSE-NEXT: andps %xmm13, %xmm3 4886; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,1,0,1] 4887; SSE-NEXT: movaps %xmm13, %xmm2 4888; SSE-NEXT: pandn %xmm1, %xmm2 4889; SSE-NEXT: por %xmm3, %xmm2 4890; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4891; SSE-NEXT: movdqa %xmm0, %xmm1 4892; SSE-NEXT: pandn %xmm7, %xmm1 4893; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,1,0,3] 4894; SSE-NEXT: pand %xmm0, %xmm2 4895; SSE-NEXT: por %xmm1, %xmm2 4896; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[3,1,2,3] 4897; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4898; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] 4899; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm8[0,2,2,3] 4900; SSE-NEXT: movdqa %xmm8, %xmm6 4901; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4902; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[0,3,2,3,4,5,6,7] 4903; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] 4904; SSE-NEXT: shufps {{.*#+}} xmm4 = 
xmm4[0,1],xmm2[2,3] 4905; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[0,1,0,1] 4906; SSE-NEXT: movaps %xmm13, %xmm2 4907; SSE-NEXT: andnps %xmm1, %xmm2 4908; SSE-NEXT: movdqa 32(%rdi), %xmm3 4909; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4910; SSE-NEXT: andps %xmm13, %xmm4 4911; SSE-NEXT: orps %xmm4, %xmm2 4912; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4913; SSE-NEXT: movdqa %xmm0, %xmm1 4914; SSE-NEXT: pandn %xmm3, %xmm1 4915; SSE-NEXT: movdqa 48(%rdi), %xmm2 4916; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4917; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] 4918; SSE-NEXT: pand %xmm0, %xmm2 4919; SSE-NEXT: por %xmm1, %xmm2 4920; SSE-NEXT: movdqa 16(%rdi), %xmm1 4921; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4922; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] 4923; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] 4924; SSE-NEXT: movdqa (%rdi), %xmm9 4925; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm9[0,2,2,3] 4926; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4927; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] 4928; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] 4929; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,3] 4930; SSE-NEXT: movdqa 64(%rdi), %xmm1 4931; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4932; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] 4933; SSE-NEXT: movaps %xmm13, %xmm2 4934; SSE-NEXT: andnps %xmm1, %xmm2 4935; SSE-NEXT: andps %xmm13, %xmm4 4936; SSE-NEXT: orps %xmm4, %xmm2 4937; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4938; SSE-NEXT: movdqa 352(%rdi), %xmm2 4939; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4940; SSE-NEXT: movdqa %xmm0, %xmm1 4941; SSE-NEXT: pandn %xmm2, %xmm1 4942; SSE-NEXT: movdqa 368(%rdi), %xmm2 4943; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4944; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] 4945; SSE-NEXT: pand %xmm0, %xmm2 4946; SSE-NEXT: por %xmm1, %xmm2 4947; SSE-NEXT: movdqa 336(%rdi), %xmm1 4948; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill 4949; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] 4950; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] 4951; SSE-NEXT: movdqa 320(%rdi), %xmm7 4952; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,2,2,3] 4953; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4954; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] 4955; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] 4956; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,3] 4957; SSE-NEXT: movdqa 384(%rdi), %xmm1 4958; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4959; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] 4960; SSE-NEXT: movaps %xmm13, %xmm2 4961; SSE-NEXT: andnps %xmm1, %xmm2 4962; SSE-NEXT: andps %xmm13, %xmm4 4963; SSE-NEXT: orps %xmm4, %xmm2 4964; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4965; SSE-NEXT: movdqa 272(%rdi), %xmm2 4966; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4967; SSE-NEXT: movdqa %xmm0, %xmm1 4968; SSE-NEXT: pandn %xmm2, %xmm1 4969; SSE-NEXT: movdqa 288(%rdi), %xmm2 4970; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4971; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] 4972; SSE-NEXT: pand %xmm0, %xmm2 4973; SSE-NEXT: por %xmm1, %xmm2 4974; SSE-NEXT: movdqa 256(%rdi), %xmm12 4975; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[3,1,2,3] 4976; SSE-NEXT: movdqa %xmm12, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4977; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] 4978; SSE-NEXT: movdqa 240(%rdi), %xmm3 4979; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4980; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,2,2,3] 4981; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] 4982; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] 4983; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,3] 4984; SSE-NEXT: movdqa 304(%rdi), %xmm1 4985; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4986; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] 4987; SSE-NEXT: movaps %xmm13, %xmm2 4988; SSE-NEXT: andnps %xmm1, %xmm2 4989; SSE-NEXT: andps %xmm13, %xmm4 4990; SSE-NEXT: orps %xmm4, %xmm2 4991; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4992; SSE-NEXT: movdqa 592(%rdi), %xmm2 4993; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4994; SSE-NEXT: movdqa %xmm0, %xmm1 4995; SSE-NEXT: pandn %xmm2, %xmm1 4996; SSE-NEXT: movdqa 608(%rdi), %xmm2 4997; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 4998; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] 4999; SSE-NEXT: pand %xmm0, %xmm2 5000; SSE-NEXT: por %xmm1, %xmm2 5001; SSE-NEXT: movdqa 576(%rdi), %xmm1 5002; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5003; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] 5004; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] 5005; SSE-NEXT: movdqa 560(%rdi), %xmm3 5006; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,2,2,3] 5007; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5008; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] 5009; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] 5010; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm2[2,3] 5011; SSE-NEXT: movdqa 624(%rdi), %xmm1 5012; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5013; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] 5014; SSE-NEXT: movaps %xmm13, %xmm2 5015; SSE-NEXT: andnps %xmm1, %xmm2 5016; SSE-NEXT: andps %xmm13, %xmm4 5017; SSE-NEXT: orps %xmm4, %xmm2 5018; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5019; SSE-NEXT: movdqa 192(%rdi), %xmm2 5020; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5021; SSE-NEXT: movdqa %xmm0, %xmm1 5022; SSE-NEXT: pandn %xmm2, %xmm1 5023; SSE-NEXT: movdqa 208(%rdi), %xmm2 5024; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5025; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,1,0,3] 5026; SSE-NEXT: pand %xmm0, %xmm4 5027; SSE-NEXT: por %xmm1, %xmm4 5028; SSE-NEXT: movdqa 176(%rdi), %xmm1 5029; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5030; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] 5031; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] 5032; SSE-NEXT: movdqa 160(%rdi), %xmm2 5033; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5034; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,2,2,3] 5035; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,3,2,3,4,5,6,7] 5036; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] 5037; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,3] 5038; SSE-NEXT: movdqa 224(%rdi), %xmm1 5039; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5040; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] 5041; SSE-NEXT: movaps %xmm13, %xmm4 5042; SSE-NEXT: andnps %xmm1, %xmm4 5043; SSE-NEXT: andps %xmm13, %xmm5 5044; SSE-NEXT: orps %xmm5, %xmm4 5045; SSE-NEXT: movaps %xmm4, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5046; SSE-NEXT: movdqa 528(%rdi), %xmm1 5047; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5048; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] 5049; SSE-NEXT: pand %xmm0, %xmm1 5050; SSE-NEXT: movdqa 512(%rdi), %xmm2 5051; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5052; SSE-NEXT: pandn %xmm2, %xmm0 5053; SSE-NEXT: por %xmm1, %xmm0 5054; SSE-NEXT: movdqa 496(%rdi), %xmm1 5055; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5056; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] 5057; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] 5058; SSE-NEXT: movdqa 480(%rdi), %xmm2 5059; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5060; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[0,2,2,3] 5061; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] 5062; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] 5063; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm0[2,3] 5064; SSE-NEXT: movdqa 544(%rdi), %xmm0 5065; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5066; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] 5067; SSE-NEXT: movaps %xmm13, %xmm1 5068; SSE-NEXT: andnps %xmm0, %xmm1 5069; SSE-NEXT: andps %xmm13, %xmm4 5070; SSE-NEXT: orps %xmm4, %xmm1 5071; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5072; SSE-NEXT: psrlq $48, %xmm10 5073; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5074; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[0,3,2,3] 5075; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,2,3,4,5,6,7] 5076; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1] 5077; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,0,0,65535,65535,65535,65535,65535] 5078; SSE-NEXT: movdqa %xmm0, %xmm4 5079; SSE-NEXT: pandn %xmm1, %xmm4 5080; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[1,3,2,3] 5081; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload 5082; SSE-NEXT: # xmm5 = mem[0,2,2,3] 5083; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] 5084; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm5[0,1,2,3,7,5,6,7] 5085; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] 5086; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] 5087; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,4,7] 5088; SSE-NEXT: pand %xmm0, %xmm1 5089; SSE-NEXT: por %xmm4, %xmm1 5090; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload 5091; SSE-NEXT: movdqa %xmm8, %xmm4 5092; SSE-NEXT: psllq $48, %xmm4 5093; SSE-NEXT: movaps %xmm13, %xmm2 5094; SSE-NEXT: andnps %xmm4, %xmm2 5095; SSE-NEXT: pand %xmm13, %xmm1 5096; SSE-NEXT: orps %xmm1, %xmm2 5097; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5098; SSE-NEXT: psrlq $48, %xmm11 5099; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[0,3,2,3] 5100; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,2,2,3,4,5,6,7] 5101; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm11[0],xmm4[1],xmm11[1] 5102; SSE-NEXT: movdqa %xmm0, %xmm1 5103; SSE-NEXT: pandn %xmm4, %xmm1 5104; SSE-NEXT: pshufd $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload 5105; SSE-NEXT: # xmm4 = mem[1,3,2,3] 5106; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload 5107; SSE-NEXT: # xmm5 = mem[0,2,2,3] 5108; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] 5109; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm5[0,1,2,3,7,5,6,7] 5110; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,1] 5111; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,0,0,0,4,5,6,7] 5112; SSE-NEXT: pshufhw 
{{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,4,7] 5113; SSE-NEXT: pand %xmm0, %xmm4 5114; SSE-NEXT: por %xmm1, %xmm4 5115; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 5116; SSE-NEXT: psllq $48, %xmm1 5117; SSE-NEXT: movdqa %xmm13, %xmm2 5118; SSE-NEXT: pandn %xmm1, %xmm2 5119; SSE-NEXT: pand %xmm13, %xmm4 5120; SSE-NEXT: por %xmm4, %xmm2 5121; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5122; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 5123; SSE-NEXT: psrlq $48, %xmm1 5124; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm9[0,3,2,3] 5125; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,2,2,3,4,5,6,7] 5126; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] 5127; SSE-NEXT: movdqa %xmm0, %xmm1 5128; SSE-NEXT: pandn %xmm4, %xmm1 5129; SSE-NEXT: pshufd $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload 5130; SSE-NEXT: # xmm4 = mem[1,3,2,3] 5131; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload 5132; SSE-NEXT: # xmm5 = mem[0,2,2,3] 5133; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] 5134; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm5[0,1,2,3,7,5,6,7] 5135; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,1] 5136; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,0,0,0,4,5,6,7] 5137; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,4,7] 5138; SSE-NEXT: pand %xmm0, %xmm4 5139; SSE-NEXT: por %xmm1, %xmm4 5140; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 5141; SSE-NEXT: psllq $48, %xmm1 5142; SSE-NEXT: movdqa %xmm13, %xmm2 5143; SSE-NEXT: pandn %xmm1, %xmm2 5144; SSE-NEXT: pand %xmm13, %xmm4 5145; SSE-NEXT: por %xmm4, %xmm2 5146; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5147; SSE-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload 5148; SSE-NEXT: psrlq $48, %xmm1 5149; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm7[0,3,2,3] 5150; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,2,2,3,4,5,6,7] 5151; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] 5152; SSE-NEXT: movdqa %xmm0, %xmm1 5153; SSE-NEXT: pandn %xmm4, %xmm1 5154; SSE-NEXT: pshufd $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload 5155; SSE-NEXT: # xmm4 = mem[1,3,2,3] 5156; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload 5157; SSE-NEXT: # xmm5 = mem[0,2,2,3] 5158; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] 5159; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm5[0,1,2,3,7,5,6,7] 5160; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,1] 5161; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,0,0,0,4,5,6,7] 5162; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,4,7] 5163; SSE-NEXT: pand %xmm0, %xmm4 5164; SSE-NEXT: por %xmm1, %xmm4 5165; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 5166; SSE-NEXT: psllq $48, %xmm1 5167; SSE-NEXT: movdqa %xmm13, %xmm2 5168; SSE-NEXT: pandn %xmm1, %xmm2 5169; SSE-NEXT: pand %xmm13, %xmm4 5170; SSE-NEXT: por %xmm4, %xmm2 5171; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5172; SSE-NEXT: psrlq $48, %xmm12 5173; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 5174; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm14[0,3,2,3] 5175; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,2,2,3,4,5,6,7] 5176; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1] 5177; SSE-NEXT: movdqa %xmm0, %xmm1 5178; SSE-NEXT: pandn %xmm4, %xmm1 5179; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload 5180; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm10[1,3,2,3] 5181; SSE-NEXT: pshufd $232, 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload 5182; SSE-NEXT: # xmm5 = mem[0,2,2,3] 5183; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] 5184; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm5[0,1,2,3,7,5,6,7] 5185; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,1] 5186; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,0,0,0,4,5,6,7] 5187; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,4,7] 5188; SSE-NEXT: pand %xmm0, %xmm4 5189; SSE-NEXT: por %xmm1, %xmm4 5190; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload 5191; SSE-NEXT: movdqa %xmm12, %xmm1 5192; SSE-NEXT: psllq $48, %xmm1 5193; SSE-NEXT: movdqa %xmm13, %xmm2 5194; SSE-NEXT: pandn %xmm1, %xmm2 5195; SSE-NEXT: pand %xmm13, %xmm4 5196; SSE-NEXT: por %xmm4, %xmm2 5197; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5198; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 5199; SSE-NEXT: psrlq $48, %xmm1 5200; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,3,2,3] 5201; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,2,2,3,4,5,6,7] 5202; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] 5203; SSE-NEXT: movdqa %xmm0, %xmm1 5204; SSE-NEXT: pandn %xmm4, %xmm1 5205; SSE-NEXT: pshufd $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload 5206; SSE-NEXT: # xmm4 = mem[1,3,2,3] 5207; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload 5208; SSE-NEXT: # xmm5 = mem[0,2,2,3] 5209; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] 5210; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm5[0,1,2,3,7,5,6,7] 5211; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,1] 5212; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,0,0,0,4,5,6,7] 5213; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,4,7] 5214; SSE-NEXT: pand %xmm0, %xmm4 5215; SSE-NEXT: por %xmm1, %xmm4 5216; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 5217; SSE-NEXT: psllq $48, %xmm1 5218; SSE-NEXT: movdqa %xmm13, %xmm2 5219; SSE-NEXT: pandn %xmm1, %xmm2 5220; SSE-NEXT: pand %xmm13, %xmm4 5221; SSE-NEXT: por %xmm4, %xmm2 5222; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5223; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload 5224; SSE-NEXT: movdqa %xmm7, %xmm1 5225; SSE-NEXT: psrlq $48, %xmm1 5226; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload 5227; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm11[0,3,2,3] 5228; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,2,2,3,4,5,6,7] 5229; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] 5230; SSE-NEXT: movdqa %xmm0, %xmm1 5231; SSE-NEXT: pandn %xmm4, %xmm1 5232; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 5233; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,3,2,3] 5234; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload 5235; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm9[0,2,2,3] 5236; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] 5237; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm5[0,1,2,3,7,5,6,7] 5238; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,1] 5239; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,0,0,0,4,5,6,7] 5240; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,6,4,7] 5241; SSE-NEXT: pand %xmm0, %xmm4 5242; SSE-NEXT: por %xmm1, %xmm4 5243; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 5244; SSE-NEXT: movdqa %xmm3, %xmm1 5245; SSE-NEXT: psllq $48, %xmm1 5246; SSE-NEXT: movdqa %xmm13, %xmm5 5247; SSE-NEXT: pandn %xmm1, %xmm5 5248; SSE-NEXT: pand %xmm13, %xmm4 5249; SSE-NEXT: por %xmm4, %xmm5 5250; SSE-NEXT: movdqa %xmm5, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5251; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 5252; SSE-NEXT: psrlq $48, %xmm1 5253; SSE-NEXT: pshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload 5254; SSE-NEXT: # xmm4 = mem[0,3,2,3] 5255; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,2,2,3,4,5,6,7] 5256; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] 5257; SSE-NEXT: pshufd $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 5258; SSE-NEXT: # xmm1 = mem[1,3,2,3] 5259; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload 5260; SSE-NEXT: # xmm5 = mem[0,2,2,3] 5261; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] 5262; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm5[0,1,2,3,7,5,6,7] 5263; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] 5264; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] 5265; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,4,7] 5266; SSE-NEXT: pand %xmm0, %xmm1 5267; SSE-NEXT: pandn %xmm4, %xmm0 5268; SSE-NEXT: por %xmm1, %xmm0 5269; SSE-NEXT: pand %xmm13, %xmm0 5270; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 5271; SSE-NEXT: psllq $48, %xmm1 5272; SSE-NEXT: pandn %xmm1, %xmm13 5273; SSE-NEXT: por %xmm0, %xmm13 5274; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5275; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 5276; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5277; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0] 5278; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3] 5279; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,1,3] 5280; SSE-NEXT: movaps {{.*#+}} xmm6 = [65535,65535,65535,0,0,0,65535,65535] 5281; SSE-NEXT: movaps %xmm6, %xmm4 5282; SSE-NEXT: andnps %xmm1, %xmm4 5283; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm15[0,1,1,3] 5284; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,7] 5285; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload 5286; SSE-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] 5287; SSE-NEXT: pand %xmm6, %xmm5 5288; SSE-NEXT: por %xmm4, %xmm5 5289; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] 5290; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[0,1,2,0] 5291; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5] 5292; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm1[2,3] 5293; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,0] 5294; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5295; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5296; SSE-NEXT: movaps %xmm0, %xmm1 5297; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload 5298; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm13[0,0] 5299; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm13[2,3] 5300; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,1,3] 5301; SSE-NEXT: movaps %xmm6, %xmm4 5302; SSE-NEXT: andnps %xmm1, %xmm4 5303; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload 5304; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm8[0,1,1,3] 5305; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,7] 5306; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload 5307; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm15[2],xmm5[3],xmm15[3] 5308; SSE-NEXT: pand %xmm6, %xmm5 5309; SSE-NEXT: por %xmm4, %xmm5 5310; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] 5311; SSE-NEXT: pshufd $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload 5312; SSE-NEXT: # xmm4 = mem[0,1,2,0] 5313; SSE-NEXT: pshufhw {{.*#+}} xmm4 = 
xmm4[0,1,2,3,4,5,6,5] 5314; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm1[2,3] 5315; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,0] 5316; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5317; SSE-NEXT: movdqa %xmm10, %xmm1 5318; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 5319; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm4[0,0] 5320; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm4[2,3] 5321; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,1,3] 5322; SSE-NEXT: movaps %xmm6, %xmm4 5323; SSE-NEXT: andnps %xmm1, %xmm4 5324; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm14[0,1,1,3] 5325; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,7] 5326; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload 5327; SSE-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3] 5328; SSE-NEXT: pand %xmm6, %xmm5 5329; SSE-NEXT: por %xmm4, %xmm5 5330; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] 5331; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm12[0,1,2,0] 5332; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5] 5333; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm1[2,3] 5334; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm4[2,0] 5335; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5336; SSE-NEXT: movdqa %xmm2, %xmm1 5337; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm9[0,0] 5338; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm9[2,3] 5339; SSE-NEXT: movdqa %xmm9, %xmm12 5340; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,1,3] 5341; SSE-NEXT: movaps %xmm6, %xmm4 5342; SSE-NEXT: andnps %xmm1, %xmm4 5343; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm11[0,1,1,3] 5344; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,7,6,7] 5345; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm7[2],xmm2[3],xmm7[3] 5346; SSE-NEXT: pand %xmm6, %xmm2 5347; SSE-NEXT: por %xmm4, %xmm2 5348; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] 5349; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,1,2,0] 5350; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5] 5351; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm1[2,3] 5352; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,0] 5353; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5354; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 5355; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 5356; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm2[0,0] 5357; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[2,3] 5358; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,1,3] 5359; SSE-NEXT: movaps %xmm6, %xmm4 5360; SSE-NEXT: andnps %xmm1, %xmm4 5361; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload 5362; SSE-NEXT: # xmm5 = mem[0,1,1,3] 5363; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,7,6,7] 5364; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 5365; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] 5366; SSE-NEXT: pand %xmm6, %xmm2 5367; SSE-NEXT: por %xmm4, %xmm2 5368; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] 5369; SSE-NEXT: pshufd $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload 5370; SSE-NEXT: # xmm4 = mem[0,1,2,0] 5371; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5] 5372; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm1[2,3] 5373; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,0] 5374; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5375; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 5376; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload 5377; 
SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm7[0,0] 5378; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm7[2,3] 5379; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,1,3] 5380; SSE-NEXT: movaps %xmm6, %xmm4 5381; SSE-NEXT: andnps %xmm1, %xmm4 5382; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload 5383; SSE-NEXT: # xmm5 = mem[0,1,1,3] 5384; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,7,6,7] 5385; SSE-NEXT: movdqa (%rsp), %xmm9 # 16-byte Reload 5386; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm9[2],xmm2[3],xmm9[3] 5387; SSE-NEXT: pand %xmm6, %xmm2 5388; SSE-NEXT: por %xmm4, %xmm2 5389; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] 5390; SSE-NEXT: pshufd $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload 5391; SSE-NEXT: # xmm4 = mem[0,1,2,0] 5392; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5] 5393; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm1[2,3] 5394; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,0] 5395; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5396; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 5397; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 5398; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm2[0,0] 5399; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[2,3] 5400; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,1,3] 5401; SSE-NEXT: movaps %xmm6, %xmm4 5402; SSE-NEXT: andnps %xmm1, %xmm4 5403; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload 5404; SSE-NEXT: # xmm5 = mem[0,1,1,3] 5405; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,7,6,7] 5406; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 5407; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] 5408; SSE-NEXT: pand %xmm6, %xmm2 5409; SSE-NEXT: por %xmm4, %xmm2 5410; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] 5411; SSE-NEXT: pshufd $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload 5412; SSE-NEXT: # xmm4 = mem[0,1,2,0] 5413; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5] 5414; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm1[2,3] 5415; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,0] 5416; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5417; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 5418; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload 5419; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm10[0,0] 5420; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm10[2,3] 5421; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,1,3] 5422; SSE-NEXT: movaps %xmm6, %xmm4 5423; SSE-NEXT: andnps %xmm1, %xmm4 5424; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload 5425; SSE-NEXT: # xmm5 = mem[0,1,1,3] 5426; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,7,6,7] 5427; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 5428; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3] 5429; SSE-NEXT: pand %xmm6, %xmm2 5430; SSE-NEXT: por %xmm4, %xmm2 5431; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] 5432; SSE-NEXT: pshufd $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload 5433; SSE-NEXT: # xmm4 = mem[0,1,2,0] 5434; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6,5] 5435; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm1[2,3] 5436; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,0] 5437; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5438; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm13[2,2,2,2,4,5,6,7] 5439; 
SSE-NEXT: movdqa %xmm6, %xmm4 5440; SSE-NEXT: pandn %xmm1, %xmm4 5441; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,1,1] 5442; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm15[0,2,2,3] 5443; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] 5444; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm5[0,3,2,3,4,5,6,7] 5445; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 5446; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,3,4,5,6,7] 5447; SSE-NEXT: pand %xmm6, %xmm1 5448; SSE-NEXT: por %xmm4, %xmm1 5449; SSE-NEXT: movdqa %xmm1, %xmm2 5450; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm13[3,0] 5451; SSE-NEXT: movaps %xmm6, %xmm3 5452; SSE-NEXT: andnps %xmm13, %xmm3 5453; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5454; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm0[0,2] 5455; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm13[0,1,2,3,7,4,6,7] 5456; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload 5457; SSE-NEXT: # xmm3 = mem[0,1,0,3] 5458; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,6] 5459; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm1[2,3] 5460; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,0] 5461; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5462; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 5463; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm14[2,2,2,2,4,5,6,7] 5464; SSE-NEXT: movdqa %xmm6, %xmm3 5465; SSE-NEXT: pandn %xmm1, %xmm3 5466; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 5467; SSE-NEXT: # xmm1 = mem[1,1,1,1] 5468; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload 5469; SSE-NEXT: # xmm4 = mem[0,2,2,3] 5470; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] 5471; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[0,3,2,3,4,5,6,7] 5472; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 5473; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,3,4,5,6,7] 5474; SSE-NEXT: pand %xmm6, %xmm1 5475; SSE-NEXT: por %xmm3, %xmm1 5476; SSE-NEXT: movdqa %xmm1, %xmm4 5477; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 5478; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm14[3,0] 5479; SSE-NEXT: movaps %xmm6, %xmm2 5480; SSE-NEXT: andnps %xmm14, %xmm2 5481; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5482; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm1[0,2] 5483; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm14[0,1,2,3,7,4,6,7] 5484; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload 5485; SSE-NEXT: # xmm3 = mem[0,1,0,3] 5486; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,5,6] 5487; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm1[2,3] 5488; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,0] 5489; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5490; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm12[2,2,2,2,4,5,6,7] 5491; SSE-NEXT: movdqa %xmm6, %xmm3 5492; SSE-NEXT: pandn %xmm1, %xmm3 5493; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[1,1,1,1] 5494; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload 5495; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm13[0,2,2,3] 5496; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] 5497; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[0,3,2,3,4,5,6,7] 5498; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 5499; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,3,4,5,6,7] 5500; SSE-NEXT: pand %xmm6, %xmm1 5501; SSE-NEXT: por %xmm3, %xmm1 5502; SSE-NEXT: movdqa %xmm1, %xmm3 5503; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 5504; 
SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm12[3,0] 5505; SSE-NEXT: movaps %xmm6, %xmm2 5506; SSE-NEXT: andnps %xmm12, %xmm2 5507; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5508; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,1],xmm1[0,2] 5509; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm12[0,1,2,3,7,4,6,7] 5510; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 5511; SSE-NEXT: # xmm2 = mem[0,1,0,3] 5512; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,6] 5513; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm1[2,3] 5514; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] 5515; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5516; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 5517; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[2,2,2,2,4,5,6,7] 5518; SSE-NEXT: movdqa %xmm6, %xmm2 5519; SSE-NEXT: pandn %xmm1, %xmm2 5520; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 5521; SSE-NEXT: # xmm1 = mem[1,1,1,1] 5522; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 5523; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,2,2,3] 5524; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] 5525; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,3,2,3,4,5,6,7] 5526; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 5527; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,3,4,5,6,7] 5528; SSE-NEXT: pand %xmm6, %xmm1 5529; SSE-NEXT: por %xmm2, %xmm1 5530; SSE-NEXT: movdqa %xmm1, %xmm3 5531; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 5532; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm4[3,0] 5533; SSE-NEXT: movaps %xmm4, %xmm2 5534; SSE-NEXT: movaps %xmm6, %xmm4 5535; SSE-NEXT: andnps %xmm2, %xmm4 5536; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5537; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2] 5538; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,7,4,6,7] 5539; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 5540; SSE-NEXT: # xmm2 = mem[0,1,0,3] 5541; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,6] 5542; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm1[2,3] 5543; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] 5544; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5545; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm7[2,2,2,2,4,5,6,7] 5546; SSE-NEXT: movdqa %xmm6, %xmm2 5547; SSE-NEXT: pandn %xmm1, %xmm2 5548; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 5549; SSE-NEXT: # xmm1 = mem[1,1,1,1] 5550; SSE-NEXT: movdqa %xmm9, %xmm11 5551; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[0,2,2,3] 5552; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] 5553; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,3,2,3,4,5,6,7] 5554; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 5555; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[1,0,3,3,4,5,6,7] 5556; SSE-NEXT: pand %xmm6, %xmm0 5557; SSE-NEXT: por %xmm2, %xmm0 5558; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 5559; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm7[3,0] 5560; SSE-NEXT: movaps %xmm6, %xmm2 5561; SSE-NEXT: andnps %xmm7, %xmm2 5562; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5563; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[0,2] 5564; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm7[0,1,2,3,7,4,6,7] 5565; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 5566; SSE-NEXT: # xmm2 = mem[0,1,0,3] 5567; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,6] 
5568; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm1[2,3] 5569; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0] 5570; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5571; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 5572; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[2,2,2,2,4,5,6,7] 5573; SSE-NEXT: movdqa %xmm6, %xmm2 5574; SSE-NEXT: pandn %xmm1, %xmm2 5575; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload 5576; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[1,1,1,1] 5577; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload 5578; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[0,2,2,3] 5579; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] 5580; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,3,2,3,4,5,6,7] 5581; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 5582; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[1,0,3,3,4,5,6,7] 5583; SSE-NEXT: pand %xmm6, %xmm0 5584; SSE-NEXT: por %xmm2, %xmm0 5585; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 5586; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm4[3,0] 5587; SSE-NEXT: movaps %xmm4, %xmm2 5588; SSE-NEXT: movaps %xmm6, %xmm3 5589; SSE-NEXT: andnps %xmm4, %xmm3 5590; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5591; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2] 5592; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,7,4,6,7] 5593; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 5594; SSE-NEXT: # xmm2 = mem[0,1,0,3] 5595; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,6] 5596; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm1[2,3] 5597; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0] 5598; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5599; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm10[2,2,2,2,4,5,6,7] 5600; SSE-NEXT: movdqa %xmm6, %xmm2 5601; SSE-NEXT: pandn %xmm1, %xmm2 5602; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload 5603; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[1,1,1,1] 5604; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload 5605; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,2,2,3] 5606; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] 5607; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,3,2,3,4,5,6,7] 5608; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 5609; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm1[1,0,3,3,4,5,6,7] 5610; SSE-NEXT: pand %xmm6, %xmm14 5611; SSE-NEXT: por %xmm2, %xmm14 5612; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 5613; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm10[3,0] 5614; SSE-NEXT: movaps %xmm6, %xmm0 5615; SSE-NEXT: andnps %xmm10, %xmm0 5616; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5617; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm1[0,2] 5618; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm10[0,1,2,3,7,4,6,7] 5619; SSE-NEXT: pshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 5620; SSE-NEXT: # xmm2 = mem[0,1,0,3] 5621; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,6] 5622; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1],xmm1[2,3] 5623; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm2[2,0] 5624; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 5625; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[2,2,2,2,4,5,6,7] 5626; SSE-NEXT: movdqa %xmm6, %xmm2 5627; SSE-NEXT: pandn %xmm1, %xmm2 5628; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload 5629; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,1,1] 5630; SSE-NEXT: movdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 5631; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3] 5632; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] 5633; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,3,2,3,4,5,6,7] 5634; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] 5635; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm1[1,0,3,3,4,5,6,7] 5636; SSE-NEXT: pand %xmm6, %xmm10 5637; SSE-NEXT: por %xmm2, %xmm10 5638; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 5639; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 5640; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,0] 5641; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2] 5642; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 5643; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 5644; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm3[3,0] 5645; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[0,2] 5646; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 5647; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm13[3,0] 5648; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm1[0,2] 5649; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 5650; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm5[3,0] 5651; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm1[0,2] 5652; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 5653; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm11[3,0] 5654; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm1[0,2] 5655; SSE-NEXT: movaps %xmm11, (%rsp) # 16-byte Spill 5656; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[2,0],xmm9[3,0] 5657; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm15[0,2] 5658; SSE-NEXT: movdqa %xmm7, %xmm1 5659; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[2,0],xmm7[3,0] 5660; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm12[0,2] 5661; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5662; SSE-NEXT: movdqa %xmm4, %xmm1 5663; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm4[3,0] 5664; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm8[0,2] 5665; SSE-NEXT: movaps %xmm1, %xmm15 5666; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 5667; SSE-NEXT: movaps %xmm4, %xmm1 5668; SSE-NEXT: movaps %xmm4, %xmm12 5669; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0] 5670; SSE-NEXT: movaps %xmm0, %xmm11 5671; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7] 5672; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] 5673; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm2[2,1,3,3,4,5,6,7] 5674; SSE-NEXT: pand %xmm6, %xmm8 5675; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,6,6,7] 5676; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] 5677; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm2[2,1,3,3,4,5,6,7] 5678; SSE-NEXT: pand %xmm6, %xmm7 5679; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm13[0,1,2,3,4,6,6,7] 5680; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] 5681; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[2,1,3,3,4,5,6,7] 5682; SSE-NEXT: pand %xmm6, %xmm0 5683; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,4,6,6,7] 5684; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] 5685; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm2[2,1,3,3,4,5,6,7] 5686; SSE-NEXT: pand %xmm6, %xmm5 5687; SSE-NEXT: pshufhw $232, (%rsp), %xmm2 # 16-byte Folded Reload 5688; SSE-NEXT: # xmm2 = mem[0,1,2,3,4,6,6,7] 5689; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] 5690; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[2,1,3,3,4,5,6,7] 5691; SSE-NEXT: pand %xmm6, %xmm4 5692; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm9[0,1,2,3,4,6,6,7] 5693; SSE-NEXT: pshufd 
{{.*#+}} xmm2 = xmm2[0,2,2,3] 5694; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[2,1,3,3,4,5,6,7] 5695; SSE-NEXT: pand %xmm6, %xmm3 5696; SSE-NEXT: pshufhw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 5697; SSE-NEXT: # xmm2 = mem[0,1,2,3,4,6,6,7] 5698; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] 5699; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,3,3,4,5,6,7] 5700; SSE-NEXT: pand %xmm6, %xmm2 5701; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm15[0,1,2,3,4,6,6,7] 5702; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,2,2,3] 5703; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[2,1,3,3,4,5,6,7] 5704; SSE-NEXT: pand %xmm6, %xmm9 5705; SSE-NEXT: andnps %xmm11, %xmm6 5706; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm1[0,2] 5707; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm11[0,1,2,3,7,4,6,7] 5708; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload 5709; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm11[0,1,0,3] 5710; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,5,6] 5711; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[3,1],xmm1[2,3] 5712; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm15[2,0] 5713; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload 5714; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 5715; SSE-NEXT: # xmm1 = mem[0,2,2,3] 5716; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload 5717; SSE-NEXT: # xmm15 = mem[0,1,1,3] 5718; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] 5719; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,4,7] 5720; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[3,1],xmm1[1,3] 5721; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm15[2,0] 5722; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload 5723; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 5724; SSE-NEXT: # xmm1 = mem[0,2,2,3] 5725; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload 5726; SSE-NEXT: # xmm15 = mem[0,1,1,3] 5727; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] 5728; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,4,7] 5729; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[3,1],xmm1[1,3] 5730; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm15[2,0] 5731; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 5732; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 5733; SSE-NEXT: # xmm1 = mem[0,2,2,3] 5734; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload 5735; SSE-NEXT: # xmm15 = mem[0,1,1,3] 5736; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] 5737; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,4,7] 5738; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[3,1],xmm1[1,3] 5739; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm15[2,0] 5740; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload 5741; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 5742; SSE-NEXT: # xmm1 = mem[0,2,2,3] 5743; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload 5744; SSE-NEXT: # xmm15 = mem[0,1,1,3] 5745; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] 5746; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,4,7] 5747; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[3,1],xmm1[1,3] 5748; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm15[2,0] 5749; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload 5750; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 5751; 
SSE-NEXT: # xmm1 = mem[0,2,2,3] 5752; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload 5753; SSE-NEXT: # xmm15 = mem[0,1,1,3] 5754; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] 5755; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,4,7] 5756; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[3,1],xmm1[1,3] 5757; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm15[2,0] 5758; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload 5759; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 5760; SSE-NEXT: # xmm1 = mem[0,2,2,3] 5761; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload 5762; SSE-NEXT: # xmm15 = mem[0,1,1,3] 5763; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] 5764; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,4,7] 5765; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[3,1],xmm1[1,3] 5766; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm15[2,0] 5767; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 5768; SSE-NEXT: pshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 5769; SSE-NEXT: # xmm1 = mem[0,2,2,3] 5770; SSE-NEXT: pshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload 5771; SSE-NEXT: # xmm15 = mem[0,1,1,3] 5772; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] 5773; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,4,7] 5774; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[3,1],xmm1[1,3] 5775; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm15[2,0] 5776; SSE-NEXT: orps %xmm9, %xmm6 5777; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[0,2,2,3] 5778; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm11[0,1,1,3] 5779; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] 5780; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,4,7] 5781; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[3,1],xmm1[1,3] 5782; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm9[2,0] 5783; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 5784; SSE-NEXT: movaps %xmm1, 96(%rsi) 5785; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 5786; SSE-NEXT: movaps %xmm1, 32(%rsi) 5787; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 5788; SSE-NEXT: movaps %xmm1, 112(%rsi) 5789; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 5790; SSE-NEXT: movaps %xmm1, 48(%rsi) 5791; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 5792; SSE-NEXT: movaps %xmm1, 64(%rsi) 5793; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 5794; SSE-NEXT: movaps %xmm1, (%rsi) 5795; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 5796; SSE-NEXT: movaps %xmm1, 80(%rsi) 5797; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 5798; SSE-NEXT: movaps %xmm1, 16(%rsi) 5799; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 5800; SSE-NEXT: movaps %xmm1, 96(%rdx) 5801; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 5802; SSE-NEXT: movaps %xmm1, 32(%rdx) 5803; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 5804; SSE-NEXT: movaps %xmm1, 112(%rdx) 5805; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 5806; SSE-NEXT: movaps %xmm1, 48(%rdx) 5807; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 5808; SSE-NEXT: movaps %xmm1, 64(%rdx) 5809; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 5810; SSE-NEXT: movaps %xmm1, (%rdx) 5811; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 
5812; SSE-NEXT: movaps %xmm1, 80(%rdx) 5813; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 5814; SSE-NEXT: movaps %xmm1, 16(%rdx) 5815; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 5816; SSE-NEXT: movaps %xmm1, 96(%rcx) 5817; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 5818; SSE-NEXT: movaps %xmm1, 112(%rcx) 5819; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 5820; SSE-NEXT: movaps %xmm1, 64(%rcx) 5821; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 5822; SSE-NEXT: movaps %xmm1, 80(%rcx) 5823; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 5824; SSE-NEXT: movaps %xmm1, 32(%rcx) 5825; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 5826; SSE-NEXT: movaps %xmm1, 48(%rcx) 5827; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 5828; SSE-NEXT: movaps %xmm1, (%rcx) 5829; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 5830; SSE-NEXT: movaps %xmm1, 16(%rcx) 5831; SSE-NEXT: movaps %xmm10, 112(%r8) 5832; SSE-NEXT: movaps %xmm14, 96(%r8) 5833; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 5834; SSE-NEXT: movaps %xmm1, 80(%r8) 5835; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 5836; SSE-NEXT: movaps %xmm1, 64(%r8) 5837; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 5838; SSE-NEXT: movaps %xmm1, 48(%r8) 5839; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 5840; SSE-NEXT: movaps %xmm1, 32(%r8) 5841; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 5842; SSE-NEXT: movaps %xmm1, 16(%r8) 5843; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 5844; SSE-NEXT: movaps %xmm1, (%r8) 5845; SSE-NEXT: movaps %xmm6, 112(%r9) 5846; SSE-NEXT: movaps %xmm2, 96(%r9) 5847; SSE-NEXT: movaps %xmm3, 80(%r9) 5848; SSE-NEXT: movaps %xmm4, 64(%r9) 5849; SSE-NEXT: movaps %xmm5, 48(%r9) 5850; SSE-NEXT: movaps %xmm0, 32(%r9) 5851; SSE-NEXT: movaps %xmm7, 16(%r9) 5852; SSE-NEXT: movaps %xmm8, (%r9) 5853; SSE-NEXT: addq $1016, %rsp # imm = 0x3F8 5854; SSE-NEXT: retq 5855; 5856; AVX-LABEL: load_i16_stride5_vf64: 5857; AVX: # %bb.0: 5858; AVX-NEXT: subq $1032, %rsp # imm = 0x408 5859; AVX-NEXT: vmovdqa 304(%rdi), %xmm0 5860; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5861; AVX-NEXT: vmovdqa 288(%rdi), %xmm1 5862; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5863; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7] 5864; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7] 5865; AVX-NEXT: vpshufb %xmm1, %xmm0, %xmm0 5866; AVX-NEXT: vmovdqa 256(%rdi), %xmm2 5867; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5868; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,1,3] 5869; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] 5870; AVX-NEXT: vmovdqa 272(%rdi), %xmm15 5871; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm15[1] 5872; AVX-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5873; AVX-NEXT: vmovdqa 240(%rdi), %xmm3 5874; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5875; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] 5876; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] 5877; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3,4,5,6,7] 5878; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm0[5,6,7] 5879; AVX-NEXT: vmovdqa 208(%rdi), %xmm0 5880; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill 5881; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] 5882; AVX-NEXT: vmovdqa 192(%rdi), %xmm3 5883; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5884; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4],xmm0[5,6,7] 5885; AVX-NEXT: vmovdqa 176(%rdi), %xmm3 5886; AVX-NEXT: vmovdqa %xmm3, (%rsp) # 16-byte Spill 5887; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,1,2,3] 5888; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7] 5889; AVX-NEXT: vmovdqa 160(%rdi), %xmm4 5890; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5891; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] 5892; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] 5893; AVX-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] 5894; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm0[4,5,6,7] 5895; AVX-NEXT: vmovaps {{.*#+}} ymm5 = [65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,65535,65535,65535] 5896; AVX-NEXT: vandps %ymm5, %ymm3, %ymm3 5897; AVX-NEXT: vmovaps 224(%rdi), %xmm0 5898; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5899; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm0[0,1,0,1] 5900; AVX-NEXT: vandnps %ymm4, %ymm5, %ymm4 5901; AVX-NEXT: vorps %ymm4, %ymm3, %ymm3 5902; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm0 5903; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5904; AVX-NEXT: vmovdqa 576(%rdi), %xmm0 5905; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5906; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,1,3] 5907; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] 5908; AVX-NEXT: vmovdqa 592(%rdi), %xmm12 5909; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm12[1] 5910; AVX-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5911; AVX-NEXT: vmovdqa 560(%rdi), %xmm0 5912; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5913; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,2,2,3] 5914; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] 5915; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3,4,5,6,7] 5916; AVX-NEXT: vmovdqa 624(%rdi), %xmm3 5917; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5918; AVX-NEXT: vmovdqa 608(%rdi), %xmm0 5919; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5920; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1],xmm3[2,3],xmm0[4,5,6,7] 5921; AVX-NEXT: vpshufb %xmm1, %xmm3, %xmm3 5922; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5,6,7] 5923; AVX-NEXT: vmovdqa 496(%rdi), %xmm0 5924; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5925; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,1,2,3] 5926; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7] 5927; AVX-NEXT: vmovdqa 480(%rdi), %xmm9 5928; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm9[0,2,2,3] 5929; AVX-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5930; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] 5931; AVX-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] 5932; AVX-NEXT: vmovdqa 528(%rdi), %xmm0 5933; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5934; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,1,0,3] 5935; AVX-NEXT: vmovdqa 512(%rdi), %xmm13 5936; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm13[4],xmm4[5,6,7] 5937; AVX-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5938; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4,5,6,7] 5939; AVX-NEXT: vandps %ymm5, %ymm3, %ymm3 5940; AVX-NEXT: vmovaps 544(%rdi), %xmm11 5941; 
AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm11[0,1,0,1] 5942; AVX-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5943; AVX-NEXT: vandnps %ymm4, %ymm5, %ymm4 5944; AVX-NEXT: vorps %ymm4, %ymm3, %ymm3 5945; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm0 5946; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5947; AVX-NEXT: vmovdqa 96(%rdi), %xmm10 5948; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm10[0,1,1,3] 5949; AVX-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5950; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] 5951; AVX-NEXT: vmovdqa 112(%rdi), %xmm0 5952; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5953; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1] 5954; AVX-NEXT: vmovdqa 80(%rdi), %xmm0 5955; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5956; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,2,2,3] 5957; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] 5958; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3,4,5,6,7] 5959; AVX-NEXT: vmovdqa 144(%rdi), %xmm7 5960; AVX-NEXT: vmovdqa 128(%rdi), %xmm6 5961; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1],xmm7[2,3],xmm6[4,5,6,7] 5962; AVX-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5963; AVX-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5964; AVX-NEXT: vpshufb %xmm1, %xmm3, %xmm3 5965; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5,6,7] 5966; AVX-NEXT: vmovdqa 16(%rdi), %xmm0 5967; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5968; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[3,1,2,3] 5969; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7] 5970; AVX-NEXT: vmovdqa (%rdi), %xmm0 5971; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5972; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,2,2,3] 5973; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,3,2,3,4,5,6,7] 5974; AVX-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] 5975; AVX-NEXT: vmovdqa 32(%rdi), %xmm0 5976; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5977; AVX-NEXT: vmovdqa 48(%rdi), %xmm4 5978; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5979; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,0,3] 5980; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm0[4],xmm4[5,6,7] 5981; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4,5,6,7] 5982; AVX-NEXT: vandps %ymm5, %ymm3, %ymm3 5983; AVX-NEXT: vmovaps 64(%rdi), %xmm0 5984; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5985; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm0[0,1,0,1] 5986; AVX-NEXT: vandnps %ymm4, %ymm5, %ymm4 5987; AVX-NEXT: vorps %ymm4, %ymm3, %ymm3 5988; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm0 5989; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 5990; AVX-NEXT: vmovdqa 464(%rdi), %xmm8 5991; AVX-NEXT: vmovdqa 448(%rdi), %xmm0 5992; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5993; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm8[2,3],xmm0[4,5,6,7] 5994; AVX-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5995; AVX-NEXT: vpshufb %xmm1, %xmm2, %xmm1 5996; AVX-NEXT: vmovdqa 416(%rdi), %xmm0 5997; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 5998; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,1,3] 5999; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] 6000; AVX-NEXT: vmovdqa 432(%rdi), %xmm0 6001; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6002; AVX-NEXT: vpunpckhqdq {{.*#+}} xmm2 = 
xmm2[1],xmm0[1] 6003; AVX-NEXT: vmovdqa 400(%rdi), %xmm0 6004; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6005; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,2,2,3] 6006; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] 6007; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3,4,5,6,7] 6008; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4],xmm1[5,6,7] 6009; AVX-NEXT: vmovdqa 336(%rdi), %xmm0 6010; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6011; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,1,2,3] 6012; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] 6013; AVX-NEXT: vmovdqa 320(%rdi), %xmm0 6014; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6015; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,2,2,3] 6016; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,3,2,3,4,5,6,7] 6017; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] 6018; AVX-NEXT: vmovdqa 368(%rdi), %xmm0 6019; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6020; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,1,0,3] 6021; AVX-NEXT: vmovdqa 352(%rdi), %xmm0 6022; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6023; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm0[4],xmm3[5,6,7] 6024; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7] 6025; AVX-NEXT: vandps %ymm5, %ymm2, %ymm2 6026; AVX-NEXT: vmovaps 384(%rdi), %xmm0 6027; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6028; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm0[0,1,0,1] 6029; AVX-NEXT: vandnps %ymm3, %ymm5, %ymm3 6030; AVX-NEXT: vorps %ymm3, %ymm2, %ymm2 6031; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm0 6032; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6033; AVX-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm1 # 16-byte Folded Reload 6034; AVX-NEXT: # xmm1 = xmm15[0,1],mem[2,3],xmm15[4,5,6,7] 6035; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,0,4,5,6,7] 6036; AVX-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] 6037; AVX-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 6038; AVX-NEXT: # xmm2 = mem[0,3,2,3] 6039; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,2,2,3,4,5,6,7] 6040; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3,4,5,6,7] 6041; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 6042; AVX-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload 6043; AVX-NEXT: # xmm3 = mem[0,1,2,3],xmm0[4,5],mem[6,7] 6044; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9] 6045; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm3 6046; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1,2,3,4],xmm3[5,6,7] 6047; AVX-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 6048; AVX-NEXT: # xmm1 = mem[0,3,2,3] 6049; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,2,2,3,4,5,6,7] 6050; AVX-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload 6051; AVX-NEXT: vpsrlq $48, %xmm0, %xmm15 6052; AVX-NEXT: vpunpckldq {{.*#+}} xmm15 = xmm1[0],xmm15[0],xmm1[1],xmm15[1] 6053; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 6054; AVX-NEXT: vpblendw $204, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload 6055; AVX-NEXT: # xmm0 = xmm0[0,1],mem[2,3],xmm0[4,5],mem[6,7] 6056; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,0,1,0,1,0,1,10,11,4,5,14,15,6,7] 6057; AVX-NEXT: vpshufb %xmm1, %xmm0, %xmm0 6058; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm15[0,1,2],xmm0[3,4,5,6,7] 6059; AVX-NEXT: vandps %ymm5, %ymm0, %ymm0 6060; AVX-NEXT: 
vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 6061; AVX-NEXT: vpsllq $48, %xmm4, %xmm15 6062; AVX-NEXT: vandnps %ymm15, %ymm5, %ymm15 6063; AVX-NEXT: vorps %ymm0, %ymm15, %ymm0 6064; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 6065; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6066; AVX-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm0 # 16-byte Folded Reload 6067; AVX-NEXT: # xmm0 = xmm12[0,1],mem[2,3],xmm12[4,5,6,7] 6068; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,0,4,5,6,7] 6069; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] 6070; AVX-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload 6071; AVX-NEXT: # xmm3 = mem[0,3,2,3] 6072; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7] 6073; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3,4,5,6,7] 6074; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 6075; AVX-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm3 # 16-byte Folded Reload 6076; AVX-NEXT: # xmm3 = mem[0,1,2,3],xmm14[4,5],mem[6,7] 6077; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm3 6078; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5,6,7] 6079; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[0,3,2,3] 6080; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7] 6081; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 6082; AVX-NEXT: vpsrlq $48, %xmm4, %xmm15 6083; AVX-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1] 6084; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload 6085; AVX-NEXT: vpblendw {{.*#+}} xmm15 = xmm13[0,1],xmm9[2,3],xmm13[4,5],xmm9[6,7] 6086; AVX-NEXT: vpshufb %xmm1, %xmm15, %xmm15 6087; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm15[3,4,5,6,7] 6088; AVX-NEXT: vandps %ymm5, %ymm3, %ymm3 6089; AVX-NEXT: vpsllq $48, %xmm11, %xmm15 6090; AVX-NEXT: vandnps %ymm15, %ymm5, %ymm15 6091; AVX-NEXT: vorps %ymm3, %ymm15, %ymm3 6092; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 6093; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6094; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload 6095; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1],xmm10[2,3],xmm13[4,5,6,7] 6096; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,0,4,5,6,7] 6097; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] 6098; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload 6099; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm12[0,3,2,3] 6100; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7] 6101; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3,4,5,6,7] 6102; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1,2,3],xmm7[4,5],xmm6[6,7] 6103; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm3 6104; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5,6,7] 6105; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload 6106; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[0,3,2,3] 6107; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7] 6108; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload 6109; AVX-NEXT: vpsrlq $48, %xmm10, %xmm15 6110; AVX-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1] 6111; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 6112; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 6113; AVX-NEXT: vpblendw {{.*#+}} xmm15 = xmm4[0,1],xmm6[2,3],xmm4[4,5],xmm6[6,7] 6114; AVX-NEXT: vpshufb %xmm1, %xmm15, %xmm15 6115; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm15[3,4,5,6,7] 6116; AVX-NEXT: vandps 
%ymm5, %ymm3, %ymm3 6117; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload 6118; AVX-NEXT: vpsllq $48, %xmm7, %xmm15 6119; AVX-NEXT: vandnps %ymm15, %ymm5, %ymm15 6120; AVX-NEXT: vorps %ymm3, %ymm15, %ymm3 6121; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0 6122; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6123; AVX-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm0 # 16-byte Folded Reload 6124; AVX-NEXT: # xmm0 = mem[0,1,2,3],xmm8[4,5],mem[6,7] 6125; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 6126; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload 6127; AVX-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm2 # 16-byte Folded Reload 6128; AVX-NEXT: # xmm2 = mem[0,1],xmm8[2,3],mem[4,5,6,7] 6129; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,0,4,5,6,7] 6130; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] 6131; AVX-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload 6132; AVX-NEXT: # xmm3 = mem[0,3,2,3] 6133; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,2,2,3,4,5,6,7] 6134; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3,4,5,6,7] 6135; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3,4],xmm0[5,6,7] 6136; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 6137; AVX-NEXT: vpblendw $204, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload 6138; AVX-NEXT: # xmm2 = xmm2[0,1],mem[2,3],xmm2[4,5],mem[6,7] 6139; AVX-NEXT: vpshufb %xmm1, %xmm2, %xmm1 6140; AVX-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 6141; AVX-NEXT: # xmm2 = mem[0,3,2,3] 6142; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,2,2,3,4,5,6,7] 6143; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 6144; AVX-NEXT: vpsrlq $48, %xmm3, %xmm3 6145; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] 6146; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4,5,6,7] 6147; AVX-NEXT: vandps %ymm5, %ymm1, %ymm1 6148; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 6149; AVX-NEXT: vpsllq $48, %xmm2, %xmm2 6150; AVX-NEXT: vandnps %ymm2, %ymm5, %ymm2 6151; AVX-NEXT: vorps %ymm2, %ymm1, %ymm1 6152; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 6153; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6154; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 6155; AVX-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload 6156; AVX-NEXT: # xmm1 = mem[0,1,2,3],xmm0[4,5],mem[6,7] 6157; AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,u,u,8,9,2,3,12,13,12,13,12,13,12,13] 6158; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm1 6159; AVX-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 6160; AVX-NEXT: # xmm2 = mem[3,1,2,3] 6161; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] 6162; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3,4,5,6,7] 6163; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 6164; AVX-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm3 # 16-byte Folded Reload 6165; AVX-NEXT: # xmm3 = xmm2[0,1],mem[2,3],xmm2[4,5,6,7] 6166; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11] 6167; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm3 6168; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1,2,3,4],xmm3[5,6,7] 6169; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 6170; AVX-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm5 # 16-byte Folded Reload 6171; AVX-NEXT: # xmm5 = mem[0,1],xmm1[2,3],mem[4,5,6,7] 6172; 
AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [2,3,2,3,2,3,2,3,12,13,6,7,12,13,14,15] 6173; AVX-NEXT: vpshufb %xmm1, %xmm5, %xmm5 6174; AVX-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload 6175; AVX-NEXT: # xmm15 = mem[0,1,1,3] 6176; AVX-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,7,6,7] 6177; AVX-NEXT: vpunpckhdq (%rsp), %xmm15, %xmm15 # 16-byte Folded Reload 6178; AVX-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3] 6179; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm15[0,1,2],xmm5[3,4,5],xmm15[6,7] 6180; AVX-NEXT: vpshufd $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload 6181; AVX-NEXT: # xmm15 = mem[0,1,2,0] 6182; AVX-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,6,5] 6183; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm15[6,7] 6184; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 6185; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6186; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 6187; AVX-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload 6188; AVX-NEXT: # xmm3 = mem[0,1,2,3],xmm3[4,5],mem[6,7] 6189; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm3 6190; AVX-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload 6191; AVX-NEXT: # xmm5 = mem[3,1,2,3] 6192; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[2,1,2,3,4,5,6,7] 6193; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3,4,5,6,7] 6194; AVX-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm5 # 16-byte Folded Reload 6195; AVX-NEXT: # xmm5 = xmm14[0,1],mem[2,3],xmm14[4,5,6,7] 6196; AVX-NEXT: vpshufb %xmm2, %xmm5, %xmm5 6197; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm5[5,6,7] 6198; AVX-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm5 # 16-byte Folded Reload 6199; AVX-NEXT: # xmm5 = mem[0,1],xmm9[2,3],mem[4,5,6,7] 6200; AVX-NEXT: vpshufb %xmm1, %xmm5, %xmm5 6201; AVX-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload 6202; AVX-NEXT: # xmm15 = mem[0,1,1,3] 6203; AVX-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,7,6,7] 6204; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 6205; AVX-NEXT: vpunpckhdq {{.*#+}} xmm15 = xmm15[2],xmm14[2],xmm15[3],xmm14[3] 6206; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm15[0,1,2],xmm5[3,4,5],xmm15[6,7] 6207; AVX-NEXT: vpshufd $36, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload 6208; AVX-NEXT: # xmm15 = mem[0,1,2,0] 6209; AVX-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,6,5] 6210; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm15[6,7] 6211; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 6212; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6213; AVX-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm3 # 16-byte Folded Reload 6214; AVX-NEXT: # xmm3 = xmm13[0,1,2,3],mem[4,5],xmm13[6,7] 6215; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm3 6216; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm12[3,1,2,3] 6217; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[2,1,2,3,4,5,6,7] 6218; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3,4,5,6,7] 6219; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 6220; AVX-NEXT: vpblendw $243, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload 6221; AVX-NEXT: # xmm5 = mem[0,1],xmm5[2,3],mem[4,5,6,7] 6222; AVX-NEXT: vpshufb %xmm2, %xmm5, %xmm5 6223; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm5[5,6,7] 6224; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm4[0,1],xmm6[2,3],xmm4[4,5,6,7] 6225; AVX-NEXT: vpshufb %xmm1, %xmm5, %xmm5 6226; AVX-NEXT: 
vpshufd {{.*#+}} xmm15 = xmm11[0,1,1,3] 6227; AVX-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,7,6,7] 6228; AVX-NEXT: vpunpckhdq {{.*#+}} xmm15 = xmm15[2],xmm10[2],xmm15[3],xmm10[3] 6229; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm15[0,1,2],xmm5[3,4,5],xmm15[6,7] 6230; AVX-NEXT: vpshufd {{.*#+}} xmm15 = xmm7[0,1,2,0] 6231; AVX-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,6,5] 6232; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm15[6,7] 6233; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 6234; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6235; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload 6236; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload 6237; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm12[0,1],xmm13[2,3],xmm12[4,5,6,7] 6238; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2 6239; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload 6240; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0,1,2,3],xmm8[4,5],xmm7[6,7] 6241; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0 6242; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload 6243; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[3,1,2,3] 6244; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7] 6245; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3,4,5,6,7] 6246; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5,6,7] 6247; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload 6248; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload 6249; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm10[0,1],xmm11[2,3],xmm10[4,5,6,7] 6250; AVX-NEXT: vpshufb %xmm1, %xmm2, %xmm1 6251; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 6252; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[0,1,1,3] 6253; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] 6254; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 6255; AVX-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] 6256; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4,5],xmm2[6,7] 6257; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload 6258; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm9[0,1,2,0] 6259; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,5] 6260; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm2[6,7] 6261; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 6262; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6263; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 6264; AVX-NEXT: vpblendw $204, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload 6265; AVX-NEXT: # xmm1 = xmm0[0,1],mem[2,3],xmm0[4,5],mem[6,7] 6266; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,0,1,10,11,4,5,14,15,14,15,14,15,14,15] 6267; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 6268; AVX-NEXT: vpsrlq $48, %xmm2, %xmm2 6269; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3,4,5,6,7] 6270; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 6271; AVX-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm3 # 16-byte Folded Reload 6272; AVX-NEXT: # xmm3 = xmm2[0,1,2,3],mem[4,5],xmm2[6,7] 6273; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13] 6274; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm3 6275; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1,2,3,4],xmm3[5,6,7] 6276; AVX-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload 6277; AVX-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm5 # 16-byte Folded Reload 6278; AVX-NEXT: # xmm5 = xmm1[0,1],mem[2,3],xmm1[4,5,6,7] 
6279; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[6,7,0,1,10,11,10,11,8,9,10,11,12,13,14,15] 6280; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 6281; AVX-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm15 # 16-byte Folded Reload 6282; AVX-NEXT: # xmm15 = xmm0[0,1,2,3],mem[4,5],xmm0[6,7] 6283; AVX-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[2,2,2,2,4,5,6,7] 6284; AVX-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,7,4,6,7] 6285; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm15[3,4,5],xmm5[6,7] 6286; AVX-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload 6287; AVX-NEXT: # xmm15 = mem[0,1,0,3] 6288; AVX-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,5,6] 6289; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm15[6,7] 6290; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 6291; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6292; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 6293; AVX-NEXT: vpblendw $204, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload 6294; AVX-NEXT: # xmm3 = xmm0[0,1],mem[2,3],xmm0[4,5],mem[6,7] 6295; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [u,u,0,1,10,11,4,5,14,15,14,15,14,15,14,15] 6296; AVX-NEXT: vpshufb %xmm1, %xmm3, %xmm3 6297; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 6298; AVX-NEXT: vpsrlq $48, %xmm0, %xmm5 6299; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1,2,3,4,5,6,7] 6300; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 6301; AVX-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload 6302; AVX-NEXT: # xmm5 = xmm0[0,1,2,3],mem[4,5],xmm0[6,7] 6303; AVX-NEXT: vpshufb %xmm2, %xmm5, %xmm5 6304; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm5[5,6,7] 6305; AVX-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm5 # 16-byte Folded Reload 6306; AVX-NEXT: # xmm5 = xmm14[0,1],mem[2,3],xmm14[4,5,6,7] 6307; AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [6,7,0,1,10,11,10,11,8,9,10,11,12,13,14,15] 6308; AVX-NEXT: vpshufb %xmm0, %xmm5, %xmm5 6309; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 6310; AVX-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm15 # 16-byte Folded Reload 6311; AVX-NEXT: # xmm15 = mem[0,1,2,3],xmm14[4,5],mem[6,7] 6312; AVX-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[2,2,2,2,4,5,6,7] 6313; AVX-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,7,4,6,7] 6314; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm15[3,4,5],xmm5[6,7] 6315; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 6316; AVX-NEXT: vpshufd {{.*#+}} xmm15 = xmm14[0,1,0,3] 6317; AVX-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,5,6] 6318; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm15[6,7] 6319; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 6320; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6321; AVX-NEXT: vpblendw $51, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm3 # 16-byte Folded Reload 6322; AVX-NEXT: # xmm3 = mem[0,1],xmm7[2,3],mem[4,5],xmm7[6,7] 6323; AVX-NEXT: vpshufb %xmm1, %xmm3, %xmm3 6324; AVX-NEXT: vpsrlq $48, %xmm8, %xmm5 6325; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1,2,3,4,5,6,7] 6326; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm12[0,1,2,3],xmm13[4,5],xmm12[6,7] 6327; AVX-NEXT: vpshufb %xmm2, %xmm5, %xmm5 6328; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm5[5,6,7] 6329; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm4[0,1],xmm6[2,3],xmm4[4,5,6,7] 6330; AVX-NEXT: vpshufb %xmm0, %xmm5, %xmm5 6331; AVX-NEXT: vmovdqa %xmm0, %xmm4 6332; 
AVX-NEXT: vpblendw {{.*#+}} xmm15 = xmm10[0,1,2,3],xmm11[4,5],xmm10[6,7] 6333; AVX-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[2,2,2,2,4,5,6,7] 6334; AVX-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,7,4,6,7] 6335; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm15[3,4,5],xmm5[6,7] 6336; AVX-NEXT: vpshufd {{.*#+}} xmm15 = xmm9[0,1,0,3] 6337; AVX-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,5,5,6] 6338; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm15[6,7] 6339; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm5, %ymm3 6340; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6341; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload 6342; AVX-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm3 # 16-byte Folded Reload 6343; AVX-NEXT: # xmm3 = xmm13[0,1,2,3],mem[4,5],xmm13[6,7] 6344; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2 6345; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload 6346; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload 6347; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm10[0,1],xmm11[2,3],xmm10[4,5],xmm11[6,7] 6348; AVX-NEXT: vpshufb %xmm1, %xmm3, %xmm0 6349; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload 6350; AVX-NEXT: vpsrlq $48, %xmm7, %xmm3 6351; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1,2,3,4,5,6,7] 6352; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5,6,7] 6353; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 6354; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 6355; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0,1],xmm3[2,3],xmm6[4,5,6,7] 6356; AVX-NEXT: vpshufb %xmm4, %xmm2, %xmm1 6357; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload 6358; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 6359; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm8[4,5],xmm4[6,7] 6360; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,2,2,2,4,5,6,7] 6361; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,4,6,7] 6362; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4,5],xmm1[6,7] 6363; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload 6364; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm9[0,1,0,3] 6365; AVX-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,5,6] 6366; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm2[6,7] 6367; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 6368; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6369; AVX-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 6370; AVX-NEXT: # xmm0 = mem[3,1,2,3] 6371; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7] 6372; AVX-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 6373; AVX-NEXT: # xmm1 = mem[0,2,2,3] 6374; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] 6375; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 6376; AVX-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 6377; AVX-NEXT: # xmm1 = mem[0,3,2,3] 6378; AVX-NEXT: vpblendw $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload 6379; AVX-NEXT: # xmm1 = xmm1[0,1,2],mem[3],xmm1[4,5,6,7] 6380; AVX-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 6381; AVX-NEXT: # xmm2 = mem[2,3,2,3] 6382; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3,4,5,6,7] 6383; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] 6384; AVX-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload 6385; AVX-NEXT: vpblendw $48, 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload 6386; AVX-NEXT: # xmm1 = xmm1[0,1,2,3],mem[4,5],xmm1[6,7] 6387; AVX-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 6388; AVX-NEXT: # xmm2 = mem[1,1,1,1] 6389; AVX-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload 6390; AVX-NEXT: # xmm5 = mem[0,2,2,3] 6391; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,0,3,4,5,6,7] 6392; AVX-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] 6393; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [8,9,2,3,12,13,12,13,8,9,12,13,12,13,14,15] 6394; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 6395; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[3,4,5],xmm1[6,7] 6396; AVX-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload 6397; AVX-NEXT: # xmm5 = mem[0,1,1,3] 6398; AVX-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,4,7] 6399; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm5[6,7] 6400; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm1 6401; AVX-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 6402; AVX-NEXT: # xmm0 = mem[3,1,2,3] 6403; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7] 6404; AVX-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload 6405; AVX-NEXT: # xmm5 = mem[0,2,2,3] 6406; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,0,3,4,5,6,7] 6407; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] 6408; AVX-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload 6409; AVX-NEXT: # xmm5 = mem[0,3,2,3] 6410; AVX-NEXT: vpblendw $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload 6411; AVX-NEXT: # xmm5 = xmm5[0,1,2],mem[3],xmm5[4,5,6,7] 6412; AVX-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload 6413; AVX-NEXT: # xmm15 = mem[2,3,2,3] 6414; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm15[0],xmm5[1,2,3,4,5,6,7] 6415; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0,1,2,3],xmm0[4,5,6,7] 6416; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 6417; AVX-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload 6418; AVX-NEXT: # xmm5 = xmm5[0,1,2,3],mem[4,5],xmm5[6,7] 6419; AVX-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload 6420; AVX-NEXT: # xmm15 = mem[1,1,1,1] 6421; AVX-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload 6422; AVX-NEXT: # xmm12 = mem[0,2,2,3] 6423; AVX-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,0,3,4,5,6,7] 6424; AVX-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm12[0],xmm15[0],xmm12[1],xmm15[1] 6425; AVX-NEXT: vpshufb %xmm2, %xmm5, %xmm5 6426; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm12[3,4,5],xmm5[6,7] 6427; AVX-NEXT: vpshufd {{.*#+}} xmm12 = xmm14[0,1,1,3] 6428; AVX-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5,4,7] 6429; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5],xmm12[6,7] 6430; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm5, %ymm0 6431; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm13[3,1,2,3] 6432; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,2,1,4,5,6,7] 6433; AVX-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload 6434; AVX-NEXT: # xmm12 = mem[0,2,2,3] 6435; AVX-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,0,3,4,5,6,7] 6436; AVX-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm12[0],xmm5[0],xmm12[1],xmm5[1] 6437; AVX-NEXT: vpshufd {{.*#+}} xmm12 = xmm10[0,3,2,3] 6438; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2],xmm11[3],xmm12[4,5,6,7] 6439; AVX-NEXT: vpshufd {{.*#+}} xmm15 = 
xmm7[2,3,2,3] 6440; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm15[0],xmm12[1,2,3,4,5,6,7] 6441; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm12[0,1,2,3],xmm5[4,5,6,7] 6442; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm6[0,1,2,3],xmm3[4,5],xmm6[6,7] 6443; AVX-NEXT: vpshufd {{.*#+}} xmm15 = xmm4[1,1,1,1] 6444; AVX-NEXT: vpshufd {{.*#+}} xmm14 = xmm8[0,2,2,3] 6445; AVX-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,1,0,3,4,5,6,7] 6446; AVX-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1] 6447; AVX-NEXT: vpshufb %xmm2, %xmm12, %xmm12 6448; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2],xmm14[3,4,5],xmm12[6,7] 6449; AVX-NEXT: vpshufd {{.*#+}} xmm14 = xmm9[0,1,1,3] 6450; AVX-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,5,4,7] 6451; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,5],xmm14[6,7] 6452; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm12, %ymm12 6453; AVX-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload 6454; AVX-NEXT: # xmm5 = mem[3,1,2,3] 6455; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,2,1,4,5,6,7] 6456; AVX-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload 6457; AVX-NEXT: # xmm13 = mem[0,2,2,3] 6458; AVX-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[0,1,0,3,4,5,6,7] 6459; AVX-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm13[0],xmm5[0],xmm13[1],xmm5[1] 6460; AVX-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload 6461; AVX-NEXT: # xmm13 = mem[0,3,2,3] 6462; AVX-NEXT: vpblendw $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm11 # 16-byte Folded Reload 6463; AVX-NEXT: # xmm11 = xmm13[0,1,2],mem[3],xmm13[4,5,6,7] 6464; AVX-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload 6465; AVX-NEXT: # xmm10 = mem[2,3,2,3] 6466; AVX-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm11[1,2,3,4,5,6,7] 6467; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm10[0,1,2,3],xmm5[4,5,6,7] 6468; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 6469; AVX-NEXT: vpblendw $48, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm8 # 16-byte Folded Reload 6470; AVX-NEXT: # xmm8 = xmm3[0,1,2,3],mem[4,5],xmm3[6,7] 6471; AVX-NEXT: vpshufb %xmm2, %xmm8, %xmm2 6472; AVX-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload 6473; AVX-NEXT: # xmm4 = mem[1,1,1,1] 6474; AVX-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload 6475; AVX-NEXT: # xmm7 = mem[0,2,2,3] 6476; AVX-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,0,3,4,5,6,7] 6477; AVX-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] 6478; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[3,4,5],xmm2[6,7] 6479; AVX-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload 6480; AVX-NEXT: # xmm4 = mem[0,1,1,3] 6481; AVX-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,4,7] 6482; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm4[6,7] 6483; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2 6484; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 6485; AVX-NEXT: vmovaps %ymm3, 64(%rsi) 6486; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 6487; AVX-NEXT: vmovaps %ymm4, (%rsi) 6488; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 6489; AVX-NEXT: vmovaps %ymm4, 96(%rsi) 6490; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 6491; AVX-NEXT: vmovaps %ymm4, 32(%rsi) 6492; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 6493; AVX-NEXT: vmovaps %ymm3, 64(%rdx) 6494; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 6495; AVX-NEXT: 
vmovaps %ymm3, (%rdx) 6496; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 6497; AVX-NEXT: vmovaps %ymm3, 96(%rdx) 6498; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 6499; AVX-NEXT: vmovaps %ymm3, 32(%rdx) 6500; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 6501; AVX-NEXT: vmovaps %ymm3, 64(%rcx) 6502; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 6503; AVX-NEXT: vmovaps %ymm3, (%rcx) 6504; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 6505; AVX-NEXT: vmovaps %ymm3, 96(%rcx) 6506; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 6507; AVX-NEXT: vmovaps %ymm3, 32(%rcx) 6508; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 6509; AVX-NEXT: vmovaps %ymm3, 64(%r8) 6510; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 6511; AVX-NEXT: vmovaps %ymm3, (%r8) 6512; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 6513; AVX-NEXT: vmovaps %ymm3, 96(%r8) 6514; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 6515; AVX-NEXT: vmovaps %ymm3, 32(%r8) 6516; AVX-NEXT: vmovaps %ymm2, 64(%r9) 6517; AVX-NEXT: vmovaps %ymm12, (%r9) 6518; AVX-NEXT: vmovaps %ymm0, 96(%r9) 6519; AVX-NEXT: vmovaps %ymm1, 32(%r9) 6520; AVX-NEXT: addq $1032, %rsp # imm = 0x408 6521; AVX-NEXT: vzeroupper 6522; AVX-NEXT: retq 6523; 6524; AVX2-LABEL: load_i16_stride5_vf64: 6525; AVX2: # %bb.0: 6526; AVX2-NEXT: subq $1048, %rsp # imm = 0x418 6527; AVX2-NEXT: vmovdqa 384(%rdi), %ymm10 6528; AVX2-NEXT: vmovdqa 512(%rdi), %ymm4 6529; AVX2-NEXT: vmovdqa 480(%rdi), %ymm14 6530; AVX2-NEXT: vmovdqa 544(%rdi), %ymm11 6531; AVX2-NEXT: vmovdqa 576(%rdi), %ymm8 6532; AVX2-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6533; AVX2-NEXT: vmovdqa 192(%rdi), %ymm3 6534; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6535; AVX2-NEXT: vmovdqa 160(%rdi), %ymm5 6536; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6537; AVX2-NEXT: vmovdqa 224(%rdi), %ymm0 6538; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6539; AVX2-NEXT: vmovdqa 256(%rdi), %ymm1 6540; AVX2-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill 6541; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15] 6542; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] 6543; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6],ymm1[7] 6544; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,10,11,4,5,14,15,8,9,10,11,4,5,6,7,16,17,26,27,20,21,30,31,24,25,26,27,20,21,22,23] 6545; AVX2-NEXT: vpshufb %ymm1, %ymm0, %ymm2 6546; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm3[1],ymm5[2,3],ymm3[4],ymm5[5],ymm3[6],ymm5[7,8],ymm3[9],ymm5[10,11],ymm3[12],ymm5[13],ymm3[14],ymm5[15] 6547; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3 6548; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0],xmm3[1,2,3],xmm0[4,5],xmm3[6,7] 6549; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7] 6550; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm3 6551; AVX2-NEXT: vpmovsxbw {{.*#+}} xmm5 = [65535,65535,65535,65535,65535,65535,65535,0] 6552; AVX2-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm2 6553; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6554; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm11[0],ymm8[1,2],ymm11[3],ymm8[4],ymm11[5],ymm8[6,7],ymm11[8],ymm8[9,10],ymm11[11],ymm8[12],ymm11[13],ymm8[14,15] 6555; AVX2-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte 
Spill 6556; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] 6557; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6],ymm3[7] 6558; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm14[0],ymm4[1],ymm14[2,3],ymm4[4],ymm14[5],ymm4[6],ymm14[7,8],ymm4[9],ymm14[10,11],ymm4[12],ymm14[13],ymm4[14],ymm14[15] 6559; AVX2-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6560; AVX2-NEXT: vmovdqa %ymm4, %ymm8 6561; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6562; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4 6563; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4,5],xmm4[6,7] 6564; AVX2-NEXT: vmovdqa 416(%rdi), %ymm13 6565; AVX2-NEXT: vpshufb %ymm1, %ymm2, %ymm2 6566; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm3 6567; AVX2-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm2 6568; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6569; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm10[0],ymm13[1,2],ymm10[3],ymm13[4],ymm10[5],ymm13[6,7],ymm10[8],ymm13[9,10],ymm10[11],ymm13[12],ymm10[13],ymm13[14,15] 6570; AVX2-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6571; AVX2-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6572; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] 6573; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6],ymm3[7] 6574; AVX2-NEXT: vmovdqa 352(%rdi), %ymm4 6575; AVX2-NEXT: vmovdqa 320(%rdi), %ymm15 6576; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm15[0],ymm4[1],ymm15[2,3],ymm4[4],ymm15[5],ymm4[6],ymm15[7,8],ymm4[9],ymm15[10,11],ymm4[12],ymm15[13],ymm4[14],ymm15[15] 6577; AVX2-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6578; AVX2-NEXT: vmovdqa %ymm4, %ymm6 6579; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6580; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4 6581; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4,5],xmm4[6,7] 6582; AVX2-NEXT: vpshufb %ymm1, %ymm2, %ymm2 6583; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm3 6584; AVX2-NEXT: vpblendvb %ymm5, %ymm3, %ymm2, %ymm2 6585; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6586; AVX2-NEXT: vmovdqa 64(%rdi), %ymm9 6587; AVX2-NEXT: vmovdqa 96(%rdi), %ymm12 6588; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm9[0],ymm12[1,2],ymm9[3],ymm12[4],ymm9[5],ymm12[6,7],ymm9[8],ymm12[9,10],ymm9[11],ymm12[12],ymm9[13],ymm12[14,15] 6589; AVX2-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6590; AVX2-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6591; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] 6592; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6],ymm3[7] 6593; AVX2-NEXT: vpshufb %ymm1, %ymm2, %ymm1 6594; AVX2-NEXT: vmovdqa (%rdi), %ymm5 6595; AVX2-NEXT: vmovdqa 32(%rdi), %ymm7 6596; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0],ymm7[1],ymm5[2,3],ymm7[4],ymm5[5],ymm7[6],ymm5[7,8],ymm7[9],ymm5[10,11],ymm7[12],ymm5[13],ymm7[14],ymm5[15] 6597; AVX2-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6598; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6599; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 6600; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4,5],xmm3[6,7] 6601; AVX2-NEXT: vpshufb %xmm0, %xmm2, %xmm0 6602; AVX2-NEXT: vpmovsxbw {{.*#+}} xmm3 = [65535,65535,65535,65535,65535,65535,65535,0] 6603; AVX2-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 6604; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6605; AVX2-NEXT: vmovdqu (%rsp), %ymm0 # 32-byte Reload 6606; AVX2-NEXT: vpblendw $82, 
{{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 6607; AVX2-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5],mem[6],ymm0[7,8],mem[9],ymm0[10,11],mem[12],ymm0[13],mem[14],ymm0[15] 6608; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] 6609; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] 6610; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6611; AVX2-NEXT: vpblendw $74, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 6612; AVX2-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2],mem[3],ymm0[4,5],mem[6],ymm0[7,8],mem[9],ymm0[10],mem[11],ymm0[12,13],mem[14],ymm0[15] 6613; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2 6614; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3],xmm0[4,5,6],xmm2[7] 6615; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [2,3,12,13,6,7,0,1,10,11,6,7,8,9,8,9,18,19,28,29,22,23,16,17,26,27,22,23,24,25,24,25] 6616; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm1 6617; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11] 6618; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2 6619; AVX2-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1 6620; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6621; AVX2-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm1 # 32-byte Folded Reload 6622; AVX2-NEXT: # ymm1 = mem[0],ymm11[1],mem[2,3],ymm11[4],mem[5],ymm11[6],mem[7,8],ymm11[9],mem[10,11],ymm11[12],mem[13],ymm11[14],mem[15] 6623; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] 6624; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7] 6625; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0],ymm14[1],ymm8[2],ymm14[3],ymm8[4,5],ymm14[6],ymm8[7,8],ymm14[9],ymm8[10],ymm14[11],ymm8[12,13],ymm14[14],ymm8[15] 6626; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm8 6627; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm8[2,3],xmm2[4,5,6],xmm8[7] 6628; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm1 6629; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2 6630; AVX2-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm11 6631; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm13[0],ymm10[1],ymm13[2,3],ymm10[4],ymm13[5],ymm10[6],ymm13[7,8],ymm10[9],ymm13[10,11],ymm10[12],ymm13[13],ymm10[14],ymm13[15] 6632; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm1[2,3,0,1] 6633; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm8[5],ymm1[6,7] 6634; AVX2-NEXT: vpblendw {{.*#+}} ymm8 = ymm6[0],ymm15[1],ymm6[2],ymm15[3],ymm6[4,5],ymm15[6],ymm6[7,8],ymm15[9],ymm6[10],ymm15[11],ymm6[12,13],ymm15[14],ymm6[15] 6635; AVX2-NEXT: vextracti128 $1, %ymm8, %xmm10 6636; AVX2-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm10[2,3],xmm8[4,5,6],xmm10[7] 6637; AVX2-NEXT: vpshufb %ymm0, %ymm1, %ymm1 6638; AVX2-NEXT: vpshufb %xmm4, %xmm8, %xmm8 6639; AVX2-NEXT: vpblendvb %ymm3, %ymm8, %ymm1, %ymm6 6640; AVX2-NEXT: vpblendw {{.*#+}} ymm8 = ymm12[0],ymm9[1],ymm12[2,3],ymm9[4],ymm12[5],ymm9[6],ymm12[7,8],ymm9[9],ymm12[10,11],ymm9[12],ymm12[13],ymm9[14],ymm12[15] 6641; AVX2-NEXT: vpermq {{.*#+}} ymm10 = ymm8[2,3,0,1] 6642; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm10[5],ymm8[6,7] 6643; AVX2-NEXT: vpshufb %ymm0, %ymm8, %ymm0 6644; AVX2-NEXT: vpblendw {{.*#+}} ymm8 = ymm7[0],ymm5[1],ymm7[2],ymm5[3],ymm7[4,5],ymm5[6],ymm7[7,8],ymm5[9],ymm7[10],ymm5[11],ymm7[12,13],ymm5[14],ymm7[15] 6645; AVX2-NEXT: vextracti128 $1, %ymm8, %xmm10 6646; AVX2-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm10[2,3],xmm8[4,5,6],xmm10[7] 6647; AVX2-NEXT: vpshufb %xmm4, %xmm8, %xmm4 6648; AVX2-NEXT: vpblendvb %ymm3, %ymm4, %ymm0, %ymm2 6649; AVX2-NEXT: vmovdqa 304(%rdi), %xmm9 6650; AVX2-NEXT: vmovdqa 288(%rdi), %xmm3 
6651; AVX2-NEXT: vpblendd {{.*#+}} xmm8 = xmm3[0],xmm9[1],xmm3[2,3] 6652; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7] 6653; AVX2-NEXT: vpshufb %xmm0, %xmm8, %xmm8 6654; AVX2-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 6655; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 6656; AVX2-NEXT: vpblendw {{.*#+}} ymm8 = ymm4[0,1,2,3,4],ymm8[5,6,7],ymm4[8,9,10,11,12],ymm8[13,14,15] 6657; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7] 6658; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6659; AVX2-NEXT: vmovdqa 624(%rdi), %xmm15 6660; AVX2-NEXT: vmovdqa 608(%rdi), %xmm12 6661; AVX2-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0],xmm15[1],xmm12[2,3] 6662; AVX2-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6663; AVX2-NEXT: vpshufb %xmm0, %xmm10, %xmm10 6664; AVX2-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 6665; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 6666; AVX2-NEXT: vpblendw {{.*#+}} ymm10 = ymm4[0,1,2,3,4],ymm10[5,6,7],ymm4[8,9,10,11,12],ymm10[13,14,15] 6667; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7] 6668; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6669; AVX2-NEXT: vmovdqa 464(%rdi), %xmm10 6670; AVX2-NEXT: vmovdqa 448(%rdi), %xmm8 6671; AVX2-NEXT: vpblendd {{.*#+}} xmm13 = xmm8[0],xmm10[1],xmm8[2,3] 6672; AVX2-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6673; AVX2-NEXT: vpshufb %xmm0, %xmm13, %xmm13 6674; AVX2-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 6675; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 6676; AVX2-NEXT: vpblendw {{.*#+}} ymm13 = ymm4[0,1,2,3,4],ymm13[5,6,7],ymm4[8,9,10,11,12],ymm13[13,14,15] 6677; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm13[4,5,6,7] 6678; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6679; AVX2-NEXT: vmovdqa 144(%rdi), %xmm5 6680; AVX2-NEXT: vmovdqa 128(%rdi), %xmm4 6681; AVX2-NEXT: vpblendd {{.*#+}} xmm14 = xmm4[0],xmm5[1],xmm4[2,3] 6682; AVX2-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6683; AVX2-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6684; AVX2-NEXT: vpshufb %xmm0, %xmm14, %xmm0 6685; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 6686; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 6687; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0,1,2,3,4],ymm0[5,6,7],ymm7[8,9,10,11,12],ymm0[13,14,15] 6688; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] 6689; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6690; AVX2-NEXT: vmovdqa %xmm3, %xmm14 6691; AVX2-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6692; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm9[2],xmm3[3] 6693; AVX2-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6694; AVX2-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9] 6695; AVX2-NEXT: vpshufb %xmm7, %xmm0, %xmm0 6696; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 6697; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 6698; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15] 6699; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 6700; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6701; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm12[0,1],xmm15[2],xmm12[3] 6702; AVX2-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6703; AVX2-NEXT: vpshufb %xmm7, %xmm0, %xmm0 6704; AVX2-NEXT: 
vinserti128 $1, %xmm0, %ymm0, %ymm0 6705; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0,1,2,3,4],ymm0[5,6,7],ymm11[8,9,10,11,12],ymm0[13,14,15] 6706; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] 6707; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6708; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm8[0,1],xmm10[2],xmm8[3] 6709; AVX2-NEXT: vmovdqa %xmm8, %xmm10 6710; AVX2-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 6711; AVX2-NEXT: vpshufb %xmm7, %xmm0, %xmm0 6712; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 6713; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0,1,2,3,4],ymm0[5,6,7],ymm6[8,9,10,11,12],ymm0[13,14,15] 6714; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7] 6715; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6716; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm4[0,1],xmm5[2],xmm4[3] 6717; AVX2-NEXT: vpshufb %xmm7, %xmm0, %xmm0 6718; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 6719; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7],ymm2[8,9,10,11,12],ymm0[13,14,15] 6720; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] 6721; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6722; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload 6723; AVX2-NEXT: vpblendw $82, (%rsp), %ymm13, %ymm0 # 32-byte Folded Reload 6724; AVX2-NEXT: # ymm0 = ymm13[0],mem[1],ymm13[2,3],mem[4],ymm13[5],mem[6],ymm13[7,8],mem[9],ymm13[10,11],mem[12],ymm13[13],mem[14],ymm13[15] 6725; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] 6726; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7] 6727; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload 6728; AVX2-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm1 # 32-byte Folded Reload 6729; AVX2-NEXT: # ymm1 = ymm15[0,1],mem[2],ymm15[3],mem[4],ymm15[5,6],mem[7],ymm15[8,9],mem[10],ymm15[11],mem[12],ymm15[13,14],mem[15] 6730; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 6731; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4],xmm1[5,6,7] 6732; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27] 6733; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm3 6734; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u] 6735; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm1 6736; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2],ymm3[3,4,5,6,7] 6737; AVX2-NEXT: vpblendd {{.*#+}} xmm6 = xmm9[0],xmm14[1],xmm9[2,3] 6738; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11] 6739; AVX2-NEXT: vpshufb %xmm1, %xmm6, %xmm6 6740; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 6741; AVX2-NEXT: vpblendw {{.*#+}} ymm6 = ymm3[0,1,2,3,4],ymm6[5,6,7],ymm3[8,9,10,11,12],ymm6[13,14,15] 6742; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] 6743; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6744; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 6745; AVX2-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload 6746; AVX2-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3],mem[4],ymm3[5],mem[6],ymm3[7,8],mem[9],ymm3[10,11],mem[12],ymm3[13],mem[14],ymm3[15] 6747; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm3[2,3,0,1] 6748; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm6[6],ymm3[7] 6749; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload 6750; AVX2-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm6 # 32-byte Folded Reload 6751; 
AVX2-NEXT: # ymm6 = ymm11[0,1],mem[2],ymm11[3],mem[4],ymm11[5,6],mem[7],ymm11[8,9],mem[10],ymm11[11],mem[12],ymm11[13,14],mem[15] 6752; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm7 6753; AVX2-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm7[3,4],xmm6[5,6,7] 6754; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm3 6755; AVX2-NEXT: vpshufb %xmm0, %xmm6, %xmm6 6756; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2],ymm3[3,4,5,6,7] 6757; AVX2-NEXT: vpblendd $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm6 # 16-byte Folded Reload 6758; AVX2-NEXT: # xmm6 = mem[0],xmm12[1],mem[2,3] 6759; AVX2-NEXT: vpshufb %xmm1, %xmm6, %xmm6 6760; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 6761; AVX2-NEXT: vpblendw {{.*#+}} ymm6 = ymm3[0,1,2,3,4],ymm6[5,6,7],ymm3[8,9,10,11,12],ymm6[13,14,15] 6762; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] 6763; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6764; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload 6765; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 6766; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm14[0],ymm4[1],ymm14[2,3],ymm4[4],ymm14[5],ymm4[6],ymm14[7,8],ymm4[9],ymm14[10,11],ymm4[12],ymm14[13],ymm4[14],ymm14[15] 6767; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm3[2,3,0,1] 6768; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm6[6],ymm3[7] 6769; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload 6770; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload 6771; AVX2-NEXT: vpblendw {{.*#+}} ymm6 = ymm9[0,1],ymm8[2],ymm9[3],ymm8[4],ymm9[5,6],ymm8[7],ymm9[8,9],ymm8[10],ymm9[11],ymm8[12],ymm9[13,14],ymm8[15] 6772; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm7 6773; AVX2-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm7[3,4],xmm6[5,6,7] 6774; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm3 6775; AVX2-NEXT: vpshufb %xmm0, %xmm6, %xmm6 6776; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2],ymm3[3,4,5,6,7] 6777; AVX2-NEXT: vpblendd $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm6 # 16-byte Folded Reload 6778; AVX2-NEXT: # xmm6 = mem[0],xmm10[1],mem[2,3] 6779; AVX2-NEXT: vpshufb %xmm1, %xmm6, %xmm6 6780; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 6781; AVX2-NEXT: vpblendw {{.*#+}} ymm6 = ymm3[0,1,2,3,4],ymm6[5,6,7],ymm3[8,9,10,11,12],ymm6[13,14,15] 6782; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] 6783; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6784; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload 6785; AVX2-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm3 # 32-byte Folded Reload 6786; AVX2-NEXT: # ymm3 = mem[0],ymm12[1],mem[2,3],ymm12[4],mem[5],ymm12[6],mem[7,8],ymm12[9],mem[10,11],ymm12[12],mem[13],ymm12[14],mem[15] 6787; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm3[2,3,0,1] 6788; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm6[6],ymm3[7] 6789; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 6790; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload 6791; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 6792; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm10[0,1],ymm5[2],ymm10[3],ymm5[4],ymm10[5,6],ymm5[7],ymm10[8,9],ymm5[10],ymm10[11],ymm5[12],ymm10[13,14],ymm5[15] 6793; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm6 6794; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm6[3,4],xmm3[5,6,7] 6795; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 6796; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] 6797; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 6798; AVX2-NEXT: vpblendd 
$13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload 6799; AVX2-NEXT: # xmm2 = mem[0],xmm2[1],mem[2,3] 6800; AVX2-NEXT: vpshufb %xmm1, %xmm2, %xmm1 6801; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 6802; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15] 6803; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 6804; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6805; AVX2-NEXT: vpblendw $181, (%rsp), %ymm13, %ymm0 # 32-byte Folded Reload 6806; AVX2-NEXT: # ymm0 = mem[0],ymm13[1],mem[2],ymm13[3],mem[4,5],ymm13[6],mem[7,8],ymm13[9],mem[10],ymm13[11],mem[12,13],ymm13[14],mem[15] 6807; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] 6808; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] 6809; AVX2-NEXT: vpblendw $41, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm1 # 32-byte Folded Reload 6810; AVX2-NEXT: # ymm1 = mem[0],ymm15[1,2],mem[3],ymm15[4],mem[5],ymm15[6,7],mem[8],ymm15[9,10],mem[11],ymm15[12],mem[13],ymm15[14,15] 6811; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 6812; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] 6813; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29] 6814; AVX2-NEXT: vpshufb %ymm2, %ymm0, %ymm3 6815; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u] 6816; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm1 6817; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2],ymm3[3,4,5,6,7] 6818; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload 6819; AVX2-NEXT: vpblendd $11, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm6 # 16-byte Folded Reload 6820; AVX2-NEXT: # xmm6 = mem[0,1],xmm15[2],mem[3] 6821; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13] 6822; AVX2-NEXT: vpshufb %xmm1, %xmm6, %xmm6 6823; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 6824; AVX2-NEXT: vpblendw {{.*#+}} ymm6 = ymm3[0,1,2,3,4],ymm6[5,6,7],ymm3[8,9,10,11,12],ymm6[13,14,15] 6825; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] 6826; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6827; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload 6828; AVX2-NEXT: vpblendw $74, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm3 # 32-byte Folded Reload 6829; AVX2-NEXT: # ymm3 = ymm13[0],mem[1],ymm13[2],mem[3],ymm13[4,5],mem[6],ymm13[7,8],mem[9],ymm13[10],mem[11],ymm13[12,13],mem[14],ymm13[15] 6830; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm3[2,3,0,1] 6831; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4],ymm3[5],ymm6[6],ymm3[7] 6832; AVX2-NEXT: vpblendw $41, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm6 # 32-byte Folded Reload 6833; AVX2-NEXT: # ymm6 = mem[0],ymm11[1,2],mem[3],ymm11[4],mem[5],ymm11[6,7],mem[8],ymm11[9,10],mem[11],ymm11[12],mem[13],ymm11[14,15] 6834; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm7 6835; AVX2-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2],xmm6[3] 6836; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm3 6837; AVX2-NEXT: vpshufb %xmm0, %xmm6, %xmm6 6838; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2],ymm3[3,4,5,6,7] 6839; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload 6840; AVX2-NEXT: vpblendd $11, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm6 # 16-byte Folded Reload 6841; AVX2-NEXT: # xmm6 = mem[0,1],xmm11[2],mem[3] 6842; AVX2-NEXT: vpshufb %xmm1, %xmm6, %xmm6 6843; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 6844; AVX2-NEXT: vpblendw {{.*#+}} ymm6 = 
ymm3[0,1,2,3,4],ymm6[5,6,7],ymm3[8,9,10,11,12],ymm6[13,14,15] 6845; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] 6846; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6847; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm14[1],ymm4[2],ymm14[3],ymm4[4,5],ymm14[6],ymm4[7,8],ymm14[9],ymm4[10],ymm14[11],ymm4[12,13],ymm14[14],ymm4[15] 6848; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm3[2,3,0,1] 6849; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4],ymm3[5],ymm6[6],ymm3[7] 6850; AVX2-NEXT: vpblendw {{.*#+}} ymm6 = ymm8[0],ymm9[1,2],ymm8[3],ymm9[4],ymm8[5],ymm9[6,7],ymm8[8],ymm9[9,10],ymm8[11],ymm9[12],ymm8[13],ymm9[14,15] 6851; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm7 6852; AVX2-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2],xmm6[3] 6853; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm3 6854; AVX2-NEXT: vpshufb %xmm0, %xmm6, %xmm6 6855; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2],ymm3[3,4,5,6,7] 6856; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 6857; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload 6858; AVX2-NEXT: vpblendd {{.*#+}} xmm6 = xmm8[0,1],xmm14[2],xmm8[3] 6859; AVX2-NEXT: vpshufb %xmm1, %xmm6, %xmm6 6860; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 6861; AVX2-NEXT: vpblendw {{.*#+}} ymm6 = ymm3[0,1,2,3,4],ymm6[5,6,7],ymm3[8,9,10,11,12],ymm6[13,14,15] 6862; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] 6863; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 6864; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload 6865; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm12[0],ymm9[1],ymm12[2],ymm9[3],ymm12[4,5],ymm9[6],ymm12[7,8],ymm9[9],ymm12[10],ymm9[11],ymm12[12,13],ymm9[14],ymm12[15] 6866; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm3[2,3,0,1] 6867; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4],ymm3[5],ymm6[6],ymm3[7] 6868; AVX2-NEXT: vpshufb %ymm2, %ymm3, %ymm2 6869; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = ymm5[0],ymm10[1,2],ymm5[3],ymm10[4],ymm5[5],ymm10[6,7],ymm5[8],ymm10[9,10],ymm5[11],ymm10[12],ymm5[13],ymm10[14,15] 6870; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm6 6871; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm6[0],xmm3[1],xmm6[2],xmm3[3] 6872; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 6873; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] 6874; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload 6875; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload 6876; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm10[0,1],xmm12[2],xmm10[3] 6877; AVX2-NEXT: vpshufb %xmm1, %xmm2, %xmm1 6878; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 6879; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15] 6880; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] 6881; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 6882; AVX2-NEXT: vpblendw $107, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload 6883; AVX2-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3],ymm0[4],mem[5,6],ymm0[7],mem[8,9],ymm0[10],mem[11],ymm0[12],mem[13,14],ymm0[15] 6884; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] 6885; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4],ymm0[5,6],ymm2[7] 6886; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 6887; AVX2-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload 6888; AVX2-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3],mem[4],ymm2[5],mem[6],ymm2[7,8],mem[9],ymm2[10,11],mem[12],ymm2[13],mem[14],ymm2[15] 6889; 
AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 6890; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7] 6891; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15,24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15] 6892; AVX2-NEXT: # ymm7 = mem[0,1,0,1] 6893; AVX2-NEXT: vpshufb %ymm7, %ymm0, %ymm0 6894; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7] 6895; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2 6896; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7] 6897; AVX2-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 6898; AVX2-NEXT: # xmm2 = mem[3,1,2,3] 6899; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,1,4,5,6,7] 6900; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm15[0,2,2,3] 6901; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,0,3,4,5,6,7] 6902; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] 6903; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 6904; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] 6905; AVX2-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm2 # 32-byte Folded Reload 6906; AVX2-NEXT: # ymm2 = ymm13[0,1],mem[2],ymm13[3],mem[4],ymm13[5,6],mem[7],ymm13[8,9],mem[10],ymm13[11],mem[12],ymm13[13,14],mem[15] 6907; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1] 6908; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4],ymm2[5,6],ymm4[7] 6909; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 6910; AVX2-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload 6911; AVX2-NEXT: # ymm4 = ymm4[0],mem[1],ymm4[2,3],mem[4],ymm4[5],mem[6],ymm4[7,8],mem[9],ymm4[10,11],mem[12],ymm4[13],mem[14],ymm4[15] 6912; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm5 6913; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3,4],xmm5[5,6,7] 6914; AVX2-NEXT: vpshufb %ymm7, %ymm2, %ymm2 6915; AVX2-NEXT: vpshufb %xmm3, %xmm4, %xmm4 6916; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3,4,5,6,7] 6917; AVX2-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload 6918; AVX2-NEXT: # xmm4 = mem[3,1,2,3] 6919; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,2,1,4,5,6,7] 6920; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm11[0,2,2,3] 6921; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,0,3,4,5,6,7] 6922; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] 6923; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 6924; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] 6925; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 6926; AVX2-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload 6927; AVX2-NEXT: # ymm4 = mem[0,1],ymm4[2],mem[3],ymm4[4],mem[5,6],ymm4[7],mem[8,9],ymm4[10],mem[11],ymm4[12],mem[13,14],ymm4[15] 6928; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] 6929; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4],ymm4[5,6],ymm5[7] 6930; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 6931; AVX2-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload 6932; AVX2-NEXT: # ymm5 = ymm5[0],mem[1],ymm5[2,3],mem[4],ymm5[5],mem[6],ymm5[7,8],mem[9],ymm5[10,11],mem[12],ymm5[13],mem[14],ymm5[15] 6933; AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6 6934; AVX2-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3,4],xmm6[5,6,7] 6935; AVX2-NEXT: vpshufb %ymm7, %ymm4, %ymm4 6936; AVX2-NEXT: vpshufb %xmm3, %xmm5, %xmm5 6937; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3,4,5,6,7] 6938; AVX2-NEXT: vpshufd 
{{.*#+}} xmm5 = xmm8[3,1,2,3] 6939; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,2,1,4,5,6,7] 6940; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm14[0,2,2,3] 6941; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,0,3,4,5,6,7] 6942; AVX2-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] 6943; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 6944; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7] 6945; AVX2-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm5 # 32-byte Folded Reload 6946; AVX2-NEXT: # ymm5 = mem[0,1],ymm9[2],mem[3],ymm9[4],mem[5,6],ymm9[7],mem[8,9],ymm9[10],mem[11],ymm9[12],mem[13,14],ymm9[15] 6947; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm5[2,3,0,1] 6948; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4],ymm5[5,6],ymm6[7] 6949; AVX2-NEXT: vpshufb %ymm7, %ymm5, %ymm5 6950; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 6951; AVX2-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload 6952; AVX2-NEXT: # ymm6 = ymm6[0],mem[1],ymm6[2,3],mem[4],ymm6[5],mem[6],ymm6[7,8],mem[9],ymm6[10,11],mem[12],ymm6[13],mem[14],ymm6[15] 6953; AVX2-NEXT: vextracti128 $1, %ymm6, %xmm7 6954; AVX2-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3,4],xmm7[5,6,7] 6955; AVX2-NEXT: vpshufb %xmm3, %xmm6, %xmm3 6956; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3,4,5,6,7] 6957; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm10[3,1,2,3] 6958; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,2,1,4,5,6,7] 6959; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm12[0,2,2,3] 6960; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,0,3,4,5,6,7] 6961; AVX2-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1] 6962; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 6963; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] 6964; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 6965; AVX2-NEXT: vmovaps %ymm5, 64(%rsi) 6966; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 6967; AVX2-NEXT: vmovaps %ymm5, (%rsi) 6968; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 6969; AVX2-NEXT: vmovaps %ymm5, 96(%rsi) 6970; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 6971; AVX2-NEXT: vmovaps %ymm5, 32(%rsi) 6972; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 6973; AVX2-NEXT: vmovaps %ymm5, 64(%rdx) 6974; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 6975; AVX2-NEXT: vmovaps %ymm5, (%rdx) 6976; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 6977; AVX2-NEXT: vmovaps %ymm5, 96(%rdx) 6978; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 6979; AVX2-NEXT: vmovaps %ymm5, 32(%rdx) 6980; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 6981; AVX2-NEXT: vmovaps %ymm5, 64(%rcx) 6982; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 6983; AVX2-NEXT: vmovaps %ymm5, (%rcx) 6984; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 6985; AVX2-NEXT: vmovaps %ymm5, 96(%rcx) 6986; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 6987; AVX2-NEXT: vmovaps %ymm5, 32(%rcx) 6988; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 6989; AVX2-NEXT: vmovaps %ymm5, 64(%r8) 6990; AVX2-NEXT: vmovdqa %ymm1, (%r8) 6991; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 6992; AVX2-NEXT: vmovaps %ymm1, 96(%r8) 6993; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 6994; AVX2-NEXT: vmovaps %ymm1, 32(%r8) 
6995; AVX2-NEXT: vmovdqa %ymm4, 64(%r9) 6996; AVX2-NEXT: vmovdqa %ymm3, (%r9) 6997; AVX2-NEXT: vmovdqa %ymm2, 96(%r9) 6998; AVX2-NEXT: vmovdqa %ymm0, 32(%r9) 6999; AVX2-NEXT: addq $1048, %rsp # imm = 0x418 7000; AVX2-NEXT: vzeroupper 7001; AVX2-NEXT: retq 7002; 7003; AVX2-FP-LABEL: load_i16_stride5_vf64: 7004; AVX2-FP: # %bb.0: 7005; AVX2-FP-NEXT: subq $1080, %rsp # imm = 0x438 7006; AVX2-FP-NEXT: vmovdqa 384(%rdi), %ymm13 7007; AVX2-FP-NEXT: vmovdqa 512(%rdi), %ymm5 7008; AVX2-FP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7009; AVX2-FP-NEXT: vmovdqa 480(%rdi), %ymm6 7010; AVX2-FP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7011; AVX2-FP-NEXT: vmovdqa 544(%rdi), %ymm7 7012; AVX2-FP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7013; AVX2-FP-NEXT: vmovdqa 576(%rdi), %ymm8 7014; AVX2-FP-NEXT: vmovdqa 192(%rdi), %ymm2 7015; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7016; AVX2-FP-NEXT: vmovdqa 160(%rdi), %ymm15 7017; AVX2-FP-NEXT: vmovdqa 224(%rdi), %ymm1 7018; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7019; AVX2-FP-NEXT: vmovdqa 256(%rdi), %ymm0 7020; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7021; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4],ymm1[5],ymm0[6,7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12],ymm1[13],ymm0[14,15] 7022; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] 7023; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6],ymm1[7] 7024; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,10,11,4,5,14,15,8,9,10,11,4,5,6,7,16,17,26,27,20,21,30,31,24,25,26,27,20,21,22,23] 7025; AVX2-FP-NEXT: vpshufb %ymm1, %ymm0, %ymm3 7026; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm15[0],ymm2[1],ymm15[2,3],ymm2[4],ymm15[5],ymm2[6],ymm15[7,8],ymm2[9],ymm15[10,11],ymm2[12],ymm15[13],ymm2[14],ymm15[15] 7027; AVX2-FP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7028; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm2 7029; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3],xmm0[4,5],xmm2[6,7] 7030; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7] 7031; AVX2-FP-NEXT: vpshufb %xmm0, %xmm2, %xmm4 7032; AVX2-FP-NEXT: vpmovsxbw {{.*#+}} xmm10 = [65535,65535,65535,65535,65535,65535,65535,0] 7033; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm4, %ymm3, %ymm2 7034; AVX2-FP-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill 7035; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm7[0],ymm8[1,2],ymm7[3],ymm8[4],ymm7[5],ymm8[6,7],ymm7[8],ymm8[9,10],ymm7[11],ymm8[12],ymm7[13],ymm8[14,15] 7036; AVX2-FP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7037; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] 7038; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6],ymm4[7] 7039; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5],ymm5[6],ymm6[7,8],ymm5[9],ymm6[10,11],ymm5[12],ymm6[13],ymm5[14],ymm6[15] 7040; AVX2-FP-NEXT: vextracti128 $1, %ymm4, %xmm6 7041; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm6[1,2,3],xmm4[4,5],xmm6[6,7] 7042; AVX2-FP-NEXT: vmovdqa 416(%rdi), %ymm14 7043; AVX2-FP-NEXT: vpshufb %ymm1, %ymm3, %ymm3 7044; AVX2-FP-NEXT: vpshufb %xmm0, %xmm4, %xmm4 7045; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm4, %ymm3, %ymm3 7046; AVX2-FP-NEXT: vmovdqa %ymm10, %ymm7 7047; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7048; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = 
ymm13[0],ymm14[1,2],ymm13[3],ymm14[4],ymm13[5],ymm14[6,7],ymm13[8],ymm14[9,10],ymm13[11],ymm14[12],ymm13[13],ymm14[14,15] 7049; AVX2-FP-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7050; AVX2-FP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7051; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm3[2,3,0,1] 7052; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6],ymm4[7] 7053; AVX2-FP-NEXT: vmovdqa 352(%rdi), %ymm12 7054; AVX2-FP-NEXT: vmovdqa 320(%rdi), %ymm11 7055; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm4 = ymm11[0],ymm12[1],ymm11[2,3],ymm12[4],ymm11[5],ymm12[6],ymm11[7,8],ymm12[9],ymm11[10,11],ymm12[12],ymm11[13],ymm12[14],ymm11[15] 7056; AVX2-FP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7057; AVX2-FP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7058; AVX2-FP-NEXT: vextracti128 $1, %ymm4, %xmm6 7059; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm6[1,2,3],xmm4[4,5],xmm6[6,7] 7060; AVX2-FP-NEXT: vpshufb %ymm1, %ymm3, %ymm3 7061; AVX2-FP-NEXT: vpshufb %xmm0, %xmm4, %xmm4 7062; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm4, %ymm3, %ymm2 7063; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7064; AVX2-FP-NEXT: vmovdqa 64(%rdi), %ymm9 7065; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm10 7066; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm9[0],ymm10[1,2],ymm9[3],ymm10[4],ymm9[5],ymm10[6,7],ymm9[8],ymm10[9,10],ymm9[11],ymm10[12],ymm9[13],ymm10[14,15] 7067; AVX2-FP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7068; AVX2-FP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7069; AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm3[2,3,0,1] 7070; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm6[5],ymm3[6],ymm6[7] 7071; AVX2-FP-NEXT: vpshufb %ymm1, %ymm3, %ymm1 7072; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm4 7073; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm5 7074; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10,11],ymm5[12],ymm4[13],ymm5[14],ymm4[15] 7075; AVX2-FP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7076; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7077; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm6 7078; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm6[1,2,3],xmm3[4,5],xmm6[6,7] 7079; AVX2-FP-NEXT: vpshufb %xmm0, %xmm3, %xmm0 7080; AVX2-FP-NEXT: vmovdqa %ymm7, %ymm2 7081; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 7082; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7083; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7084; AVX2-FP-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 7085; AVX2-FP-NEXT: # ymm0 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5],ymm0[6],mem[7,8],ymm0[9],mem[10,11],ymm0[12],mem[13],ymm0[14],mem[15] 7086; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] 7087; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] 7088; AVX2-FP-NEXT: vpblendw $181, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm1 # 32-byte Folded Reload 7089; AVX2-FP-NEXT: # ymm1 = mem[0],ymm15[1],mem[2],ymm15[3],mem[4,5],ymm15[6],mem[7,8],ymm15[9],mem[10],ymm15[11],mem[12,13],ymm15[14],mem[15] 7090; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm3 7091; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5,6],xmm3[7] 7092; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm7 = [2,3,12,13,6,7,0,1,10,11,6,7,8,9,8,9,18,19,28,29,22,23,16,17,26,27,22,23,24,25,24,25] 7093; AVX2-FP-NEXT: vpshufb %ymm7, %ymm0, 
%ymm0 7094; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11] 7095; AVX2-FP-NEXT: vpshufb %xmm6, %xmm1, %xmm1 7096; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 7097; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7098; AVX2-FP-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload 7099; AVX2-FP-NEXT: # ymm0 = ymm8[0],mem[1],ymm8[2,3],mem[4],ymm8[5],mem[6],ymm8[7,8],mem[9],ymm8[10,11],mem[12],ymm8[13],mem[14],ymm8[15] 7100; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] 7101; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] 7102; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload 7103; AVX2-FP-NEXT: vpblendw $181, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm1 # 32-byte Folded Reload 7104; AVX2-FP-NEXT: # ymm1 = mem[0],ymm15[1],mem[2],ymm15[3],mem[4,5],ymm15[6],mem[7,8],ymm15[9],mem[10],ymm15[11],mem[12,13],ymm15[14],mem[15] 7105; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm8 7106; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm8[2,3],xmm1[4,5,6],xmm8[7] 7107; AVX2-FP-NEXT: vpshufb %ymm7, %ymm0, %ymm0 7108; AVX2-FP-NEXT: vpshufb %xmm6, %xmm1, %xmm1 7109; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm3 7110; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm14[0],ymm13[1],ymm14[2,3],ymm13[4],ymm14[5],ymm13[6],ymm14[7,8],ymm13[9],ymm14[10,11],ymm13[12],ymm14[13],ymm13[14],ymm14[15] 7111; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm0[2,3,0,1] 7112; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm8[5],ymm0[6,7] 7113; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm8 = ymm12[0],ymm11[1],ymm12[2],ymm11[3],ymm12[4,5],ymm11[6],ymm12[7,8],ymm11[9],ymm12[10],ymm11[11],ymm12[12,13],ymm11[14],ymm12[15] 7114; AVX2-FP-NEXT: vextracti128 $1, %ymm8, %xmm11 7115; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm11[2,3],xmm8[4,5,6],xmm11[7] 7116; AVX2-FP-NEXT: vpshufb %ymm7, %ymm0, %ymm0 7117; AVX2-FP-NEXT: vpshufb %xmm6, %xmm8, %xmm8 7118; AVX2-FP-NEXT: vmovdqa %ymm2, %ymm1 7119; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm8, %ymm0, %ymm2 7120; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm8 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5],ymm9[6],ymm10[7,8],ymm9[9],ymm10[10,11],ymm9[12],ymm10[13],ymm9[14],ymm10[15] 7121; AVX2-FP-NEXT: vpermq {{.*#+}} ymm11 = ymm8[2,3,0,1] 7122; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm11[5],ymm8[6,7] 7123; AVX2-FP-NEXT: vpshufb %ymm7, %ymm8, %ymm7 7124; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm8 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15] 7125; AVX2-FP-NEXT: vextracti128 $1, %ymm8, %xmm11 7126; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm11[2,3],xmm8[4,5,6],xmm11[7] 7127; AVX2-FP-NEXT: vpshufb %xmm6, %xmm8, %xmm6 7128; AVX2-FP-NEXT: vpblendvb %ymm1, %ymm6, %ymm7, %ymm12 7129; AVX2-FP-NEXT: vmovdqa 304(%rdi), %xmm1 7130; AVX2-FP-NEXT: vmovdqa 288(%rdi), %xmm9 7131; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm7 = xmm9[0],xmm1[1],xmm9[2,3] 7132; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm0 = [6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7] 7133; AVX2-FP-NEXT: vpshufb %xmm0, %xmm7, %xmm7 7134; AVX2-FP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 7135; AVX2-FP-NEXT: vmovdqu (%rsp), %ymm4 # 32-byte Reload 7136; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm7 = ymm4[0,1,2,3,4],ymm7[5,6,7],ymm4[8,9,10,11,12],ymm7[13,14,15] 7137; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7] 7138; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7139; AVX2-FP-NEXT: vmovdqa 624(%rdi), %xmm10 7140; AVX2-FP-NEXT: 
vmovdqa 608(%rdi), %xmm8 7141; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm7 = xmm8[0],xmm10[1],xmm8[2,3] 7142; AVX2-FP-NEXT: vpshufb %xmm0, %xmm7, %xmm7 7143; AVX2-FP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 7144; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 7145; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm7 = ymm4[0,1,2,3,4],ymm7[5,6,7],ymm4[8,9,10,11,12],ymm7[13,14,15] 7146; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7] 7147; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7148; AVX2-FP-NEXT: vmovdqa 464(%rdi), %xmm6 7149; AVX2-FP-NEXT: vmovdqa 448(%rdi), %xmm5 7150; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm7 = xmm5[0],xmm6[1],xmm5[2,3] 7151; AVX2-FP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7152; AVX2-FP-NEXT: vpshufb %xmm0, %xmm7, %xmm7 7153; AVX2-FP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 7154; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 7155; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm7 = ymm4[0,1,2,3,4],ymm7[5,6,7],ymm4[8,9,10,11,12],ymm7[13,14,15] 7156; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm7[4,5,6,7] 7157; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7158; AVX2-FP-NEXT: vmovdqa 144(%rdi), %xmm11 7159; AVX2-FP-NEXT: vmovdqa 128(%rdi), %xmm7 7160; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm14 = xmm7[0],xmm11[1],xmm7[2,3] 7161; AVX2-FP-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7162; AVX2-FP-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7163; AVX2-FP-NEXT: vpshufb %xmm0, %xmm14, %xmm0 7164; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 7165; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 7166; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6,7],ymm4[8,9,10,11,12],ymm0[13,14,15] 7167; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] 7168; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7169; AVX2-FP-NEXT: vmovdqa %xmm9, %xmm4 7170; AVX2-FP-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7171; AVX2-FP-NEXT: vmovdqa %xmm1, %xmm14 7172; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7173; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm9[0,1],xmm1[2],xmm9[3] 7174; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9] 7175; AVX2-FP-NEXT: vpshufb %xmm9, %xmm0, %xmm0 7176; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 7177; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 7178; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15] 7179; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 7180; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7181; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm8[0,1],xmm10[2],xmm8[3] 7182; AVX2-FP-NEXT: vmovdqa %xmm10, %xmm13 7183; AVX2-FP-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7184; AVX2-FP-NEXT: vmovdqa %xmm8, %xmm10 7185; AVX2-FP-NEXT: vmovdqa %xmm8, (%rsp) # 16-byte Spill 7186; AVX2-FP-NEXT: vpshufb %xmm9, %xmm0, %xmm0 7187; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 7188; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7],ymm3[8,9,10,11,12],ymm0[13,14,15] 7189; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] 7190; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7191; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm5[0,1],xmm6[2],xmm5[3] 7192; AVX2-FP-NEXT: vmovdqa %xmm6, %xmm3 
7193; AVX2-FP-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 7194; AVX2-FP-NEXT: vpshufb %xmm9, %xmm0, %xmm0 7195; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 7196; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7],ymm2[8,9,10,11,12],ymm0[13,14,15] 7197; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] 7198; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7199; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm7[0,1],xmm11[2],xmm7[3] 7200; AVX2-FP-NEXT: vpshufb %xmm9, %xmm0, %xmm0 7201; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 7202; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0,1,2,3,4],ymm0[5,6,7],ymm12[8,9,10,11,12],ymm0[13,14,15] 7203; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] 7204; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7205; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload 7206; AVX2-FP-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload 7207; AVX2-FP-NEXT: # ymm0 = mem[0],ymm8[1],mem[2,3],ymm8[4],mem[5],ymm8[6],mem[7,8],ymm8[9],mem[10,11],ymm8[12],mem[13],ymm8[14],mem[15] 7208; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] 7209; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7] 7210; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 7211; AVX2-FP-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm1 # 32-byte Folded Reload 7212; AVX2-FP-NEXT: # ymm1 = ymm5[0,1],mem[2],ymm5[3],mem[4],ymm5[5,6],mem[7],ymm5[8,9],mem[10],ymm5[11],mem[12],ymm5[13,14],mem[15] 7213; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm2 7214; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4],xmm1[5,6,7] 7215; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27] 7216; AVX2-FP-NEXT: vpshufb %ymm11, %ymm0, %ymm2 7217; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm0 = [4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u] 7218; AVX2-FP-NEXT: vpshufb %xmm0, %xmm1, %xmm1 7219; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7] 7220; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm9 = xmm14[0],xmm4[1],xmm14[2,3] 7221; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11] 7222; AVX2-FP-NEXT: vpshufb %xmm1, %xmm9, %xmm9 7223; AVX2-FP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 7224; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0,1,2,3,4],ymm9[5,6,7],ymm2[8,9,10,11,12],ymm9[13,14,15] 7225; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] 7226; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7227; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 7228; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 7229; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm6[1],ymm4[2,3],ymm6[4],ymm4[5],ymm6[6],ymm4[7,8],ymm6[9],ymm4[10,11],ymm6[12],ymm4[13],ymm6[14],ymm4[15] 7230; AVX2-FP-NEXT: vpermq {{.*#+}} ymm9 = ymm2[2,3,0,1] 7231; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm9[6],ymm2[7] 7232; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 7233; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm9 = ymm7[0,1],ymm15[2],ymm7[3],ymm15[4],ymm7[5,6],ymm15[7],ymm7[8,9],ymm15[10],ymm7[11],ymm15[12],ymm7[13,14],ymm15[15] 7234; AVX2-FP-NEXT: vextracti128 $1, %ymm9, %xmm12 7235; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm12[3,4],xmm9[5,6,7] 7236; AVX2-FP-NEXT: vpshufb %ymm11, %ymm2, %ymm2 7237; AVX2-FP-NEXT: vpshufb %xmm0, 
%xmm9, %xmm9 7238; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2],ymm2[3,4,5,6,7] 7239; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm9 = xmm13[0],xmm10[1],xmm13[2,3] 7240; AVX2-FP-NEXT: vpshufb %xmm1, %xmm9, %xmm9 7241; AVX2-FP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 7242; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0,1,2,3,4],ymm9[5,6,7],ymm2[8,9,10,11,12],ymm9[13,14,15] 7243; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] 7244; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7245; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload 7246; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload 7247; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm15[0],ymm14[1],ymm15[2,3],ymm14[4],ymm15[5],ymm14[6],ymm15[7,8],ymm14[9],ymm15[10,11],ymm14[12],ymm15[13],ymm14[14],ymm15[15] 7248; AVX2-FP-NEXT: vpermq {{.*#+}} ymm9 = ymm2[2,3,0,1] 7249; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm9[6],ymm2[7] 7250; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload 7251; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload 7252; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0,1],ymm11[2],ymm10[3],ymm11[4],ymm10[5,6],ymm11[7],ymm10[8,9],ymm11[10],ymm10[11],ymm11[12],ymm10[13,14],ymm11[15] 7253; AVX2-FP-NEXT: vextracti128 $1, %ymm9, %xmm12 7254; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm12[3,4],xmm9[5,6,7] 7255; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27] 7256; AVX2-FP-NEXT: vpshufb %ymm12, %ymm2, %ymm2 7257; AVX2-FP-NEXT: vpshufb %xmm0, %xmm9, %xmm9 7258; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2],ymm2[3,4,5,6,7] 7259; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload 7260; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm9 = xmm3[0],xmm13[1],xmm3[2,3] 7261; AVX2-FP-NEXT: vpshufb %xmm1, %xmm9, %xmm9 7262; AVX2-FP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 7263; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0,1,2,3,4],ymm9[5,6,7],ymm2[8,9,10,11,12],ymm9[13,14,15] 7264; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] 7265; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7266; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 7267; AVX2-FP-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload 7268; AVX2-FP-NEXT: # ymm2 = mem[0],ymm2[1],mem[2,3],ymm2[4],mem[5],ymm2[6],mem[7,8],ymm2[9],mem[10,11],ymm2[12],mem[13],ymm2[14],mem[15] 7269; AVX2-FP-NEXT: vpermq {{.*#+}} ymm9 = ymm2[2,3,0,1] 7270; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm9[6],ymm2[7] 7271; AVX2-FP-NEXT: vpshufb %ymm12, %ymm2, %ymm2 7272; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 7273; AVX2-FP-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload 7274; AVX2-FP-NEXT: # ymm3 = mem[0,1],ymm3[2],mem[3],ymm3[4],mem[5,6],ymm3[7],mem[8,9],ymm3[10],mem[11],ymm3[12],mem[13,14],ymm3[15] 7275; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm9 7276; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm9[3,4],xmm3[5,6,7] 7277; AVX2-FP-NEXT: vpshufb %xmm0, %xmm3, %xmm0 7278; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] 7279; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 7280; AVX2-FP-NEXT: vpblendd $13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload 7281; AVX2-FP-NEXT: # xmm2 = mem[0],xmm2[1],mem[2,3] 7282; AVX2-FP-NEXT: 
vpshufb %xmm1, %xmm2, %xmm1 7283; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 7284; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15] 7285; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 7286; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7287; AVX2-FP-NEXT: vpblendw $74, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload 7288; AVX2-FP-NEXT: # ymm0 = ymm8[0],mem[1],ymm8[2],mem[3],ymm8[4,5],mem[6],ymm8[7,8],mem[9],ymm8[10],mem[11],ymm8[12,13],mem[14],ymm8[15] 7289; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] 7290; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] 7291; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload 7292; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0],ymm5[1,2],ymm8[3],ymm5[4],ymm8[5],ymm5[6,7],ymm8[8],ymm5[9,10],ymm8[11],ymm5[12],ymm8[13],ymm5[14,15] 7293; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm2 7294; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] 7295; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29] 7296; AVX2-FP-NEXT: vpshufb %ymm3, %ymm0, %ymm2 7297; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm0 = [6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u] 7298; AVX2-FP-NEXT: vpshufb %xmm0, %xmm1, %xmm1 7299; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7] 7300; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 7301; AVX2-FP-NEXT: vpblendd $4, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm9 # 16-byte Folded Reload 7302; AVX2-FP-NEXT: # xmm9 = xmm1[0,1],mem[2],xmm1[3] 7303; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13] 7304; AVX2-FP-NEXT: vpshufb %xmm1, %xmm9, %xmm9 7305; AVX2-FP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 7306; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0,1,2,3,4],ymm9[5,6,7],ymm2[8,9,10,11,12],ymm9[13,14,15] 7307; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] 7308; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7309; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0],ymm4[1],ymm6[2],ymm4[3],ymm6[4,5],ymm4[6],ymm6[7,8],ymm4[9],ymm6[10],ymm4[11],ymm6[12,13],ymm4[14],ymm6[15] 7310; AVX2-FP-NEXT: vpermq {{.*#+}} ymm9 = ymm2[2,3,0,1] 7311; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4],ymm2[5],ymm9[6],ymm2[7] 7312; AVX2-FP-NEXT: vpblendw $41, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm9 # 32-byte Folded Reload 7313; AVX2-FP-NEXT: # ymm9 = mem[0],ymm7[1,2],mem[3],ymm7[4],mem[5],ymm7[6,7],mem[8],ymm7[9,10],mem[11],ymm7[12],mem[13],ymm7[14,15] 7314; AVX2-FP-NEXT: vextracti128 $1, %ymm9, %xmm12 7315; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm9 = xmm12[0],xmm9[1],xmm12[2],xmm9[3] 7316; AVX2-FP-NEXT: vpshufb %ymm3, %ymm2, %ymm2 7317; AVX2-FP-NEXT: vpshufb %xmm0, %xmm9, %xmm9 7318; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2],ymm2[3,4,5,6,7] 7319; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 7320; AVX2-FP-NEXT: vpblendd $4, (%rsp), %xmm4, %xmm9 # 16-byte Folded Reload 7321; AVX2-FP-NEXT: # xmm9 = xmm4[0,1],mem[2],xmm4[3] 7322; AVX2-FP-NEXT: vpshufb %xmm1, %xmm9, %xmm9 7323; AVX2-FP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 7324; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0,1,2,3,4],ymm9[5,6,7],ymm2[8,9,10,11,12],ymm9[13,14,15] 7325; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] 7326; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7327; 
AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm14[0],ymm15[1],ymm14[2],ymm15[3],ymm14[4,5],ymm15[6],ymm14[7,8],ymm15[9],ymm14[10],ymm15[11],ymm14[12,13],ymm15[14],ymm14[15] 7328; AVX2-FP-NEXT: vpermq {{.*#+}} ymm9 = ymm2[2,3,0,1] 7329; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4],ymm2[5],ymm9[6],ymm2[7] 7330; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm9 = ymm11[0],ymm10[1,2],ymm11[3],ymm10[4],ymm11[5],ymm10[6,7],ymm11[8],ymm10[9,10],ymm11[11],ymm10[12],ymm11[13],ymm10[14,15] 7331; AVX2-FP-NEXT: vextracti128 $1, %ymm9, %xmm12 7332; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm9 = xmm12[0],xmm9[1],xmm12[2],xmm9[3] 7333; AVX2-FP-NEXT: vpshufb %ymm3, %ymm2, %ymm2 7334; AVX2-FP-NEXT: vpshufb %xmm0, %xmm9, %xmm9 7335; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2],ymm2[3,4,5,6,7] 7336; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload 7337; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm9 = xmm15[0,1],xmm13[2],xmm15[3] 7338; AVX2-FP-NEXT: vpshufb %xmm1, %xmm9, %xmm9 7339; AVX2-FP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 7340; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0,1,2,3,4],ymm9[5,6,7],ymm2[8,9,10,11,12],ymm9[13,14,15] 7341; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4,5,6,7] 7342; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7343; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload 7344; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 7345; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = ymm12[0],ymm7[1],ymm12[2],ymm7[3],ymm12[4,5],ymm7[6],ymm12[7,8],ymm7[9],ymm12[10],ymm7[11],ymm12[12,13],ymm7[14],ymm12[15] 7346; AVX2-FP-NEXT: vpermq {{.*#+}} ymm9 = ymm2[2,3,0,1] 7347; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm9[4],ymm2[5],ymm9[6],ymm2[7] 7348; AVX2-FP-NEXT: vpshufb %ymm3, %ymm2, %ymm2 7349; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload 7350; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload 7351; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = ymm10[0],ymm11[1,2],ymm10[3],ymm11[4],ymm10[5],ymm11[6,7],ymm10[8],ymm11[9,10],ymm10[11],ymm11[12],ymm10[13],ymm11[14,15] 7352; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm9 7353; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm9[0],xmm3[1],xmm9[2],xmm3[3] 7354; AVX2-FP-NEXT: vpshufb %xmm0, %xmm3, %xmm0 7355; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3,4,5,6,7] 7356; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 7357; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 7358; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm5[2],xmm3[3] 7359; AVX2-FP-NEXT: vpshufb %xmm1, %xmm2, %xmm1 7360; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 7361; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15] 7362; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 7363; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7364; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7365; AVX2-FP-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 7366; AVX2-FP-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3],mem[4],ymm0[5,6],mem[7],ymm0[8,9],mem[10],ymm0[11],mem[12],ymm0[13,14],mem[15] 7367; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] 7368; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4],ymm0[5,6],ymm1[7] 7369; AVX2-FP-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload 7370; AVX2-FP-NEXT: # ymm0 = 
mem[0],ymm8[1],mem[2,3],ymm8[4],mem[5],ymm8[6],mem[7,8],ymm8[9],mem[10,11],ymm8[12],mem[13],ymm8[14],mem[15] 7371; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm2 7372; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm0[3,4],xmm2[5,6,7] 7373; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15,24,25,18,19,28,29,22,23,0,0,0,0,4,5,14,15] 7374; AVX2-FP-NEXT: # ymm0 = mem[0,1,0,1] 7375; AVX2-FP-NEXT: vpshufb %ymm0, %ymm1, %ymm1 7376; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm13 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7] 7377; AVX2-FP-NEXT: vpshufb %xmm13, %xmm2, %xmm2 7378; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm1[3,4,5,6,7] 7379; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm9 = [12,13,14,15,4,5,14,15,8,9,10,11,12,13,14,15] 7380; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 7381; AVX2-FP-NEXT: vpshufb %xmm9, %xmm1, %xmm4 7382; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,3,0,1,10,11,8,9,10,11,12,13,14,15] 7383; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 7384; AVX2-FP-NEXT: vpshufb %xmm1, %xmm6, %xmm6 7385; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1] 7386; AVX2-FP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 7387; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm2[0,1,2,3,4,5],ymm4[6,7] 7388; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 7389; AVX2-FP-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload 7390; AVX2-FP-NEXT: # ymm2 = ymm2[0,1],mem[2],ymm2[3],mem[4],ymm2[5,6],mem[7],ymm2[8,9],mem[10],ymm2[11],mem[12],ymm2[13,14],mem[15] 7391; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1] 7392; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4],ymm2[5,6],ymm4[7] 7393; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 7394; AVX2-FP-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload 7395; AVX2-FP-NEXT: # ymm4 = ymm4[0],mem[1],ymm4[2,3],mem[4],ymm4[5],mem[6],ymm4[7,8],mem[9],ymm4[10,11],mem[12],ymm4[13],mem[14],ymm4[15] 7396; AVX2-FP-NEXT: vextracti128 $1, %ymm4, %xmm14 7397; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm4 = xmm14[0,1,2],xmm4[3,4],xmm14[5,6,7] 7398; AVX2-FP-NEXT: vpshufb %ymm0, %ymm2, %ymm2 7399; AVX2-FP-NEXT: vpshufb %xmm13, %xmm4, %xmm4 7400; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3,4,5,6,7] 7401; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 7402; AVX2-FP-NEXT: vpshufb %xmm9, %xmm4, %xmm4 7403; AVX2-FP-NEXT: vmovdqa (%rsp), %xmm8 # 16-byte Reload 7404; AVX2-FP-NEXT: vpshufb %xmm1, %xmm8, %xmm8 7405; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] 7406; AVX2-FP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 7407; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7] 7408; AVX2-FP-NEXT: vpshufb %xmm9, %xmm3, %xmm4 7409; AVX2-FP-NEXT: vpshufb %xmm1, %xmm5, %xmm5 7410; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] 7411; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm5 = ymm12[0,1],ymm7[2],ymm12[3],ymm7[4],ymm12[5,6],ymm7[7],ymm12[8,9],ymm7[10],ymm12[11],ymm7[12],ymm12[13,14],ymm7[15] 7412; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm5[2,3,0,1] 7413; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4],ymm5[5,6],ymm7[7] 7414; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm7 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10,11],ymm10[12],ymm11[13],ymm10[14],ymm11[15] 7415; AVX2-FP-NEXT: vextracti128 $1, %ymm7, %xmm8 7416; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm7 
= xmm8[0,1,2],xmm7[3,4],xmm8[5,6,7] 7417; AVX2-FP-NEXT: vpshufb %ymm0, %ymm5, %ymm5 7418; AVX2-FP-NEXT: vpshufb %xmm13, %xmm7, %xmm7 7419; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3,4,5,6,7] 7420; AVX2-FP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 7421; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7] 7422; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 7423; AVX2-FP-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm5 # 32-byte Folded Reload 7424; AVX2-FP-NEXT: # ymm5 = mem[0,1],ymm3[2],mem[3],ymm3[4],mem[5,6],ymm3[7],mem[8,9],ymm3[10],mem[11],ymm3[12],mem[13,14],ymm3[15] 7425; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm5[2,3,0,1] 7426; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4],ymm5[5,6],ymm7[7] 7427; AVX2-FP-NEXT: vpshufb %ymm0, %ymm5, %ymm0 7428; AVX2-FP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 7429; AVX2-FP-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm5 # 32-byte Folded Reload 7430; AVX2-FP-NEXT: # ymm5 = ymm3[0],mem[1],ymm3[2,3],mem[4],ymm3[5],mem[6],ymm3[7,8],mem[9],ymm3[10,11],mem[12],ymm3[13],mem[14],ymm3[15] 7431; AVX2-FP-NEXT: vextracti128 $1, %ymm5, %xmm7 7432; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0,1,2],xmm5[3,4],xmm7[5,6,7] 7433; AVX2-FP-NEXT: vpshufb %xmm13, %xmm5, %xmm3 7434; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2],ymm0[3,4,5,6,7] 7435; AVX2-FP-NEXT: vpshufb %xmm9, %xmm15, %xmm3 7436; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 7437; AVX2-FP-NEXT: vpshufb %xmm1, %xmm5, %xmm1 7438; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] 7439; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 7440; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] 7441; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 7442; AVX2-FP-NEXT: vmovaps %ymm1, 64(%rsi) 7443; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 7444; AVX2-FP-NEXT: vmovaps %ymm1, (%rsi) 7445; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 7446; AVX2-FP-NEXT: vmovaps %ymm1, 96(%rsi) 7447; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 7448; AVX2-FP-NEXT: vmovaps %ymm1, 32(%rsi) 7449; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 7450; AVX2-FP-NEXT: vmovaps %ymm1, 64(%rdx) 7451; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 7452; AVX2-FP-NEXT: vmovaps %ymm1, (%rdx) 7453; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 7454; AVX2-FP-NEXT: vmovaps %ymm1, 96(%rdx) 7455; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 7456; AVX2-FP-NEXT: vmovaps %ymm1, 32(%rdx) 7457; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 7458; AVX2-FP-NEXT: vmovaps %ymm1, 64(%rcx) 7459; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 7460; AVX2-FP-NEXT: vmovaps %ymm1, (%rcx) 7461; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 7462; AVX2-FP-NEXT: vmovaps %ymm1, 96(%rcx) 7463; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 7464; AVX2-FP-NEXT: vmovaps %ymm1, 32(%rcx) 7465; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 7466; AVX2-FP-NEXT: vmovaps %ymm1, 64(%r8) 7467; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 7468; AVX2-FP-NEXT: vmovaps %ymm1, (%r8) 7469; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 7470; 
AVX2-FP-NEXT: vmovaps %ymm1, 96(%r8) 7471; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 7472; AVX2-FP-NEXT: vmovaps %ymm1, 32(%r8) 7473; AVX2-FP-NEXT: vmovdqa %ymm0, 64(%r9) 7474; AVX2-FP-NEXT: vmovdqa %ymm4, (%r9) 7475; AVX2-FP-NEXT: vmovdqa %ymm2, 96(%r9) 7476; AVX2-FP-NEXT: vmovdqa %ymm6, 32(%r9) 7477; AVX2-FP-NEXT: addq $1080, %rsp # imm = 0x438 7478; AVX2-FP-NEXT: vzeroupper 7479; AVX2-FP-NEXT: retq 7480; 7481; AVX2-FCP-LABEL: load_i16_stride5_vf64: 7482; AVX2-FCP: # %bb.0: 7483; AVX2-FCP-NEXT: subq $1000, %rsp # imm = 0x3E8 7484; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm9 7485; AVX2-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7486; AVX2-FCP-NEXT: vmovdqa 544(%rdi), %ymm14 7487; AVX2-FCP-NEXT: vmovdqa 576(%rdi), %ymm11 7488; AVX2-FCP-NEXT: vmovdqa 512(%rdi), %ymm8 7489; AVX2-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7490; AVX2-FCP-NEXT: vmovdqa 480(%rdi), %ymm10 7491; AVX2-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7492; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %ymm4 7493; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7494; AVX2-FCP-NEXT: vmovdqa 256(%rdi), %ymm3 7495; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7496; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm0 7497; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7498; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm1 7499; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7500; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] 7501; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 7502; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3],xmm0[4,5],xmm1[6,7] 7503; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,6,7] 7504; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm2 7505; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1,2],ymm4[3],ymm3[4],ymm4[5],ymm3[6,7],ymm4[8],ymm3[9,10],ymm4[11],ymm3[12],ymm4[13],ymm3[14,15] 7506; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [1,3,0,2,4,6,1,3] 7507; AVX2-FCP-NEXT: vpermd %ymm3, %ymm1, %ymm4 7508; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,6,7,8,9,14,15,4,5,14,15,4,5,2,3,16,17,22,23,24,25,30,31,20,21,30,31,20,21,18,19] 7509; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm4, %ymm4 7510; AVX2-FCP-NEXT: vpmovsxbw {{.*#+}} xmm13 = [65535,65535,65535,65535,65535,65535,65535,0] 7511; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm2, %ymm4, %ymm2 7512; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7513; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm10[0],ymm8[1],ymm10[2,3],ymm8[4],ymm10[5],ymm8[6],ymm10[7,8],ymm8[9],ymm10[10,11],ymm8[12],ymm10[13],ymm8[14],ymm10[15] 7514; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm4 7515; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4,5],xmm4[6,7] 7516; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm2 7517; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm14[0],ymm11[1,2],ymm14[3],ymm11[4],ymm14[5],ymm11[6,7],ymm14[8],ymm11[9,10],ymm14[11],ymm11[12],ymm14[13],ymm11[14,15] 7518; AVX2-FCP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7519; AVX2-FCP-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7520; AVX2-FCP-NEXT: vpermd %ymm4, %ymm1, %ymm4 7521; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm4, %ymm4 7522; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm2, %ymm4, %ymm2 7523; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7524; 
AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm15 7525; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm9[0],ymm15[1],ymm9[2,3],ymm15[4],ymm9[5],ymm15[6],ymm9[7,8],ymm15[9],ymm9[10,11],ymm15[12],ymm9[13],ymm15[14],ymm9[15] 7526; AVX2-FCP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7527; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm4 7528; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4,5],xmm4[6,7] 7529; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm8 7530; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm12 7531; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm8[0],ymm12[1,2],ymm8[3],ymm12[4],ymm8[5],ymm12[6,7],ymm8[8],ymm12[9,10],ymm8[11],ymm12[12],ymm8[13],ymm12[14,15] 7532; AVX2-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7533; AVX2-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7534; AVX2-FCP-NEXT: vpermd %ymm4, %ymm1, %ymm4 7535; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm4, %ymm4 7536; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm2 7537; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm2, %ymm4, %ymm2 7538; AVX2-FCP-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill 7539; AVX2-FCP-NEXT: vmovdqa 352(%rdi), %ymm9 7540; AVX2-FCP-NEXT: vmovdqa 320(%rdi), %ymm7 7541; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0],ymm9[1],ymm7[2,3],ymm9[4],ymm7[5],ymm9[6],ymm7[7,8],ymm9[9],ymm7[10,11],ymm9[12],ymm7[13],ymm9[14],ymm7[15] 7542; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7543; AVX2-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7544; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm4 7545; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4,5],xmm4[6,7] 7546; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm0 7547; AVX2-FCP-NEXT: vmovdqa 384(%rdi), %ymm5 7548; AVX2-FCP-NEXT: vmovdqa 416(%rdi), %ymm10 7549; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0],ymm10[1,2],ymm5[3],ymm10[4],ymm5[5],ymm10[6,7],ymm5[8],ymm10[9,10],ymm5[11],ymm10[12],ymm5[13],ymm10[14,15] 7550; AVX2-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7551; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7552; AVX2-FCP-NEXT: vpermd %ymm2, %ymm1, %ymm1 7553; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm1, %ymm1 7554; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm0, %ymm1, %ymm0 7555; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7556; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7557; AVX2-FCP-NEXT: vpblendw $181, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 7558; AVX2-FCP-NEXT: # ymm0 = mem[0],ymm0[1],mem[2],ymm0[3],mem[4,5],ymm0[6],mem[7,8],ymm0[9],mem[10],ymm0[11],mem[12,13],ymm0[14],mem[15] 7559; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 7560; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6],xmm1[7] 7561; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11] 7562; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm0, %xmm0 7563; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 7564; AVX2-FCP-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload 7565; AVX2-FCP-NEXT: # ymm1 = mem[0],ymm1[1],mem[2,3],ymm1[4],mem[5],ymm1[6],mem[7,8],ymm1[9],mem[10,11],ymm1[12],mem[13],ymm1[14],mem[15] 7566; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [2,0,0,0,4,7,1,6] 7567; AVX2-FCP-NEXT: vpermd %ymm1, %ymm2, %ymm3 7568; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [2,3,4,5,10,11,0,1,14,15,2,3,12,13,0,1,18,19,20,21,26,27,16,17,30,31,18,19,28,29,16,17] 7569; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm3, %ymm3 7570; AVX2-FCP-NEXT: 
vpblendvb %ymm13, %ymm0, %ymm3, %ymm6 7571; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7572; AVX2-FCP-NEXT: vpblendw $74, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 7573; AVX2-FCP-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2],mem[3],ymm0[4,5],mem[6],ymm0[7,8],mem[9],ymm0[10],mem[11],ymm0[12,13],mem[14],ymm0[15] 7574; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm3 7575; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5,6],xmm3[7] 7576; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm0, %xmm0 7577; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm11[0],ymm14[1],ymm11[2,3],ymm14[4],ymm11[5],ymm14[6],ymm11[7,8],ymm14[9],ymm11[10,11],ymm14[12],ymm11[13],ymm14[14],ymm11[15] 7578; AVX2-FCP-NEXT: vpermd %ymm3, %ymm2, %ymm3 7579; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm3, %ymm3 7580; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm0, %ymm3, %ymm3 7581; AVX2-FCP-NEXT: vpblendw $74, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload 7582; AVX2-FCP-NEXT: # ymm0 = ymm15[0],mem[1],ymm15[2],mem[3],ymm15[4,5],mem[6],ymm15[7,8],mem[9],ymm15[10],mem[11],ymm15[12,13],mem[14],ymm15[15] 7583; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm11 7584; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm11[2,3],xmm0[4,5,6],xmm11[7] 7585; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0],ymm8[1],ymm12[2,3],ymm8[4],ymm12[5],ymm8[6],ymm12[7,8],ymm8[9],ymm12[10,11],ymm8[12],ymm12[13],ymm8[14],ymm12[15] 7586; AVX2-FCP-NEXT: vpermd %ymm11, %ymm2, %ymm11 7587; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm11, %ymm11 7588; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm0, %xmm0 7589; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm0, %ymm11, %ymm0 7590; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm9[0],ymm7[1],ymm9[2],ymm7[3],ymm9[4,5],ymm7[6],ymm9[7,8],ymm7[9],ymm9[10],ymm7[11],ymm9[12,13],ymm7[14],ymm9[15] 7591; AVX2-FCP-NEXT: vextracti128 $1, %ymm11, %xmm12 7592; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1],xmm12[2,3],xmm11[4,5,6],xmm12[7] 7593; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm11, %xmm4 7594; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm10[0],ymm5[1],ymm10[2,3],ymm5[4],ymm10[5],ymm5[6],ymm10[7,8],ymm5[9],ymm10[10,11],ymm5[12],ymm10[13],ymm5[14],ymm10[15] 7595; AVX2-FCP-NEXT: vpermd %ymm11, %ymm2, %ymm2 7596; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm2, %ymm1 7597; AVX2-FCP-NEXT: vpblendvb %ymm13, %ymm4, %ymm1, %ymm1 7598; AVX2-FCP-NEXT: vmovdqa 288(%rdi), %ymm12 7599; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,3,1,3,0,3,5,7] 7600; AVX2-FCP-NEXT: vpermd %ymm12, %ymm5, %ymm11 7601; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm14 = [0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27,0,0,18,19,20,21,26,27] 7602; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm11, %ymm11 7603; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 7604; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm2[0,1,2,3,4],ymm11[5,6,7],ymm2[8,9,10,11,12],ymm11[13,14,15] 7605; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5,6,7] 7606; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7607; AVX2-FCP-NEXT: vmovdqa 608(%rdi), %ymm10 7608; AVX2-FCP-NEXT: vpermd %ymm10, %ymm5, %ymm11 7609; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm11, %ymm11 7610; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 7611; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm2[0,1,2,3,4],ymm11[5,6,7],ymm2[8,9,10,11,12],ymm11[13,14,15] 7612; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5,6,7] 7613; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7614; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm9 
7615; AVX2-FCP-NEXT: vpermd %ymm9, %ymm5, %ymm11 7616; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm11, %ymm11 7617; AVX2-FCP-NEXT: vmovdqu (%rsp), %ymm2 # 32-byte Reload 7618; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm2[0,1,2,3,4],ymm11[5,6,7],ymm2[8,9,10,11,12],ymm11[13,14,15] 7619; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5,6,7] 7620; AVX2-FCP-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill 7621; AVX2-FCP-NEXT: vmovdqa 448(%rdi), %ymm4 7622; AVX2-FCP-NEXT: vpermd %ymm4, %ymm5, %ymm5 7623; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7624; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm5, %ymm5 7625; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 7626; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm2[0,1,2,3,4],ymm5[5,6,7],ymm2[8,9,10,11,12],ymm5[13,14,15] 7627; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7] 7628; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7629; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [1,3,2,3,1,3,6,7] 7630; AVX2-FCP-NEXT: vpermd %ymm12, %ymm7, %ymm11 7631; AVX2-FCP-NEXT: vmovdqa %ymm12, %ymm8 7632; AVX2-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7633; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm13 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25] 7634; AVX2-FCP-NEXT: vpshufb %ymm13, %ymm11, %ymm11 7635; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm6[0,1,2,3,4],ymm11[5,6,7],ymm6[8,9,10,11,12],ymm11[13,14,15] 7636; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm11[4,5,6,7] 7637; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7638; AVX2-FCP-NEXT: vpermd %ymm10, %ymm7, %ymm6 7639; AVX2-FCP-NEXT: vmovdqa %ymm10, %ymm12 7640; AVX2-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7641; AVX2-FCP-NEXT: vpshufb %ymm13, %ymm6, %ymm6 7642; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm3[0,1,2,3,4],ymm6[5,6,7],ymm3[8,9,10,11,12],ymm6[13,14,15] 7643; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm6[4,5,6,7] 7644; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7645; AVX2-FCP-NEXT: vpermd %ymm9, %ymm7, %ymm3 7646; AVX2-FCP-NEXT: vmovdqa %ymm9, %ymm5 7647; AVX2-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7648; AVX2-FCP-NEXT: vpshufb %ymm13, %ymm3, %ymm3 7649; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm0[0,1,2,3,4],ymm3[5,6,7],ymm0[8,9,10,11,12],ymm3[13,14,15] 7650; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7] 7651; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7652; AVX2-FCP-NEXT: vpermd %ymm4, %ymm7, %ymm0 7653; AVX2-FCP-NEXT: vpshufb %ymm13, %ymm0, %ymm0 7654; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7],ymm1[8,9,10,11,12],ymm0[13,14,15] 7655; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 7656; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7657; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload 7658; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 7659; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0,1],ymm4[2],ymm9[3],ymm4[4],ymm9[5,6],ymm4[7],ymm9[8,9],ymm4[10],ymm9[11],ymm4[12],ymm9[13,14],ymm4[15] 7660; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 7661; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3,4],xmm0[5,6,7] 7662; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u] 7663; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm6 7664; 
AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 7665; AVX2-FCP-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload 7666; AVX2-FCP-NEXT: # ymm3 = ymm1[0],mem[1],ymm1[2,3],mem[4],ymm1[5],mem[6],ymm1[7,8],mem[9],ymm1[10,11],mem[12],ymm1[13],mem[14],ymm1[15] 7667; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,0,0,5,7,2,4] 7668; AVX2-FCP-NEXT: vpermd %ymm3, %ymm1, %ymm7 7669; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23] 7670; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm7, %ymm7 7671; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm6[0,1,2],ymm7[3,4,5,6,7] 7672; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [1,4,6,0,1,4,6,0] 7673; AVX2-FCP-NEXT: # ymm6 = mem[0,1,0,1] 7674; AVX2-FCP-NEXT: vpermd %ymm8, %ymm6, %ymm11 7675; AVX2-FCP-NEXT: vmovdqa %ymm14, %ymm8 7676; AVX2-FCP-NEXT: vpshufb %ymm14, %ymm11, %ymm11 7677; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm7[0,1,2,3,4],ymm11[5,6,7],ymm7[8,9,10,11,12],ymm11[13,14,15] 7678; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm11[4,5,6,7] 7679; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7680; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload 7681; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload 7682; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm15[0,1],ymm14[2],ymm15[3],ymm14[4],ymm15[5,6],ymm14[7],ymm15[8,9],ymm14[10],ymm15[11],ymm14[12],ymm15[13,14],ymm14[15] 7683; AVX2-FCP-NEXT: vextracti128 $1, %ymm7, %xmm11 7684; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm11[3,4],xmm7[5,6,7] 7685; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm7, %xmm7 7686; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload 7687; AVX2-FCP-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm11 # 32-byte Folded Reload 7688; AVX2-FCP-NEXT: # ymm11 = ymm10[0],mem[1],ymm10[2,3],mem[4],ymm10[5],mem[6],ymm10[7,8],mem[9],ymm10[10,11],mem[12],ymm10[13],mem[14],ymm10[15] 7689; AVX2-FCP-NEXT: vpermd %ymm11, %ymm1, %ymm11 7690; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm11, %ymm11 7691; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2],ymm11[3,4,5,6,7] 7692; AVX2-FCP-NEXT: vpermd %ymm12, %ymm6, %ymm11 7693; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm11, %ymm11 7694; AVX2-FCP-NEXT: vmovdqa %ymm8, %ymm13 7695; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm7[0,1,2,3,4],ymm11[5,6,7],ymm7[8,9,10,11,12],ymm11[13,14,15] 7696; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm11[4,5,6,7] 7697; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7698; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload 7699; AVX2-FCP-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm7 # 32-byte Folded Reload 7700; AVX2-FCP-NEXT: # ymm7 = mem[0,1],ymm12[2],mem[3],ymm12[4],mem[5,6],ymm12[7],mem[8,9],ymm12[10],mem[11],ymm12[12],mem[13,14],ymm12[15] 7701; AVX2-FCP-NEXT: vextracti128 $1, %ymm7, %xmm11 7702; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm11[3,4],xmm7[5,6,7] 7703; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload 7704; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 7705; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm2[0],ymm8[1],ymm2[2,3],ymm8[4],ymm2[5],ymm8[6],ymm2[7,8],ymm8[9],ymm2[10,11],ymm8[12],ymm2[13],ymm8[14],ymm2[15] 7706; AVX2-FCP-NEXT: vpermd %ymm11, %ymm1, %ymm11 7707; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm11, %ymm11 7708; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm7, %xmm7 7709; AVX2-FCP-NEXT: 
vpblendd {{.*#+}} ymm7 = ymm7[0,1,2],ymm11[3,4,5,6,7] 7710; AVX2-FCP-NEXT: vpermd %ymm5, %ymm6, %ymm11 7711; AVX2-FCP-NEXT: vpshufb %ymm13, %ymm11, %ymm11 7712; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm11 = ymm7[0,1,2,3,4],ymm11[5,6,7],ymm7[8,9,10,11,12],ymm11[13,14,15] 7713; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm11[4,5,6,7] 7714; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7715; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload 7716; AVX2-FCP-NEXT: vpblendw $107, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm7 # 32-byte Folded Reload 7717; AVX2-FCP-NEXT: # ymm7 = mem[0,1],ymm13[2],mem[3],ymm13[4],mem[5,6],ymm13[7],mem[8,9],ymm13[10],mem[11],ymm13[12],mem[13,14],ymm13[15] 7718; AVX2-FCP-NEXT: vextracti128 $1, %ymm7, %xmm11 7719; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1,2],xmm11[3,4],xmm7[5,6,7] 7720; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm7, %xmm0 7721; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 7722; AVX2-FCP-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm7 # 32-byte Folded Reload 7723; AVX2-FCP-NEXT: # ymm7 = ymm5[0],mem[1],ymm5[2,3],mem[4],ymm5[5],mem[6],ymm5[7,8],mem[9],ymm5[10,11],mem[12],ymm5[13],mem[14],ymm5[15] 7724; AVX2-FCP-NEXT: vpermd %ymm7, %ymm1, %ymm1 7725; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm1, %ymm1 7726; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] 7727; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm1 # 32-byte Folded Reload 7728; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27] 7729; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15] 7730; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 7731; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7732; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm9[1,2],ymm4[3],ymm9[4],ymm4[5],ymm9[6,7],ymm4[8],ymm9[9,10],ymm4[11],ymm9[12],ymm4[13],ymm9[14,15] 7733; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 7734; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] 7735; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm4 = [6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u] 7736; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm0, %xmm0 7737; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload 7738; AVX2-FCP-NEXT: vpblendw $74, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm3 # 32-byte Folded Reload 7739; AVX2-FCP-NEXT: # ymm3 = ymm11[0],mem[1],ymm11[2],mem[3],ymm11[4,5],mem[6],ymm11[7,8],mem[9],ymm11[10],mem[11],ymm11[12,13],mem[14],ymm11[15] 7740; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,3,0,0,5,0,2,7] 7741; AVX2-FCP-NEXT: vpermd %ymm3, %ymm9, %ymm6 7742; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,u,u,u,u,u,u,u,u,2,3,4,5,18,19,20,21,26,27,16,17,30,31,30,31,18,19,20,21] 7743; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm6, %ymm6 7744; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm6[3,4,5,6,7] 7745; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [2,4,7,0,2,4,7,0] 7746; AVX2-FCP-NEXT: # ymm6 = mem[0,1,0,1] 7747; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload 7748; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm1 = [0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25,0,0,16,17,22,23,24,25] 7749; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm7, %ymm7 7750; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm0[0,1,2,3,4],ymm7[5,6,7],ymm0[8,9,10,11,12],ymm7[13,14,15] 7751; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm0[0,1,2,3],ymm7[4,5,6,7] 7752; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7753; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm14[0],ymm15[1,2],ymm14[3],ymm15[4],ymm14[5],ymm15[6,7],ymm14[8],ymm15[9,10],ymm14[11],ymm15[12],ymm14[13],ymm15[14,15] 7754; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm7 7755; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm7[0],xmm0[1],xmm7[2],xmm0[3] 7756; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm0, %xmm0 7757; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload 7758; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm15[0],ymm10[1],ymm15[2],ymm10[3],ymm15[4,5],ymm10[6],ymm15[7,8],ymm10[9],ymm15[10],ymm10[11],ymm15[12,13],ymm10[14],ymm15[15] 7759; AVX2-FCP-NEXT: vpermd %ymm7, %ymm9, %ymm7 7760; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm7, %ymm7 7761; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm7[3,4,5,6,7] 7762; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload 7763; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm7, %ymm7 7764; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm0[0,1,2,3,4],ymm7[5,6,7],ymm0[8,9,10,11,12],ymm7[13,14,15] 7765; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] 7766; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7767; AVX2-FCP-NEXT: vpblendw $214, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm0 # 32-byte Folded Reload 7768; AVX2-FCP-NEXT: # ymm0 = ymm12[0],mem[1,2],ymm12[3],mem[4],ymm12[5],mem[6,7],ymm12[8],mem[9,10],ymm12[11],mem[12],ymm12[13],mem[14,15] 7769; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm7 7770; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm7[0],xmm0[1],xmm7[2],xmm0[3] 7771; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm2[1],ymm8[2],ymm2[3],ymm8[4,5],ymm2[6],ymm8[7,8],ymm2[9],ymm8[10],ymm2[11],ymm8[12,13],ymm2[14],ymm8[15] 7772; AVX2-FCP-NEXT: vmovdqa %ymm8, %ymm12 7773; AVX2-FCP-NEXT: vpermd %ymm7, %ymm9, %ymm7 7774; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm7, %ymm7 7775; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm0, %xmm0 7776; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm7[3,4,5,6,7] 7777; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload 7778; AVX2-FCP-NEXT: vpermd %ymm8, %ymm6, %ymm7 7779; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm7, %ymm7 7780; AVX2-FCP-NEXT: vmovdqa %ymm1, %ymm2 7781; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm0[0,1,2,3,4],ymm7[5,6,7],ymm0[8,9,10,11,12],ymm7[13,14,15] 7782; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7] 7783; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7784; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload 7785; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm13[0],ymm10[1,2],ymm13[3],ymm10[4],ymm13[5],ymm10[6,7],ymm13[8],ymm10[9,10],ymm13[11],ymm10[12],ymm13[13],ymm10[14,15] 7786; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm7 7787; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm7[0],xmm0[1],xmm7[2],xmm0[3] 7788; AVX2-FCP-NEXT: vpshufb %xmm4, %xmm0, %xmm0 7789; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload 7790; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm13[0],ymm5[1],ymm13[2],ymm5[3],ymm13[4,5],ymm5[6],ymm13[7,8],ymm5[9],ymm13[10],ymm5[11],ymm13[12,13],ymm5[14],ymm13[15] 7791; AVX2-FCP-NEXT: vpermd %ymm1, %ymm9, %ymm1 7792; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm1, %ymm1 7793; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] 7794; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload 7795; AVX2-FCP-NEXT: vpermd %ymm14, %ymm6, %ymm1 7796; AVX2-FCP-NEXT: vpshufb %ymm2, 
%ymm1, %ymm1 7797; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5,6,7],ymm0[8,9,10,11,12],ymm1[13,14,15] 7798; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] 7799; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 7800; AVX2-FCP-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 7801; AVX2-FCP-NEXT: # ymm0 = ymm0[0],mem[1],ymm0[2,3],mem[4],ymm0[5],mem[6],ymm0[7,8],mem[9],ymm0[10,11],mem[12],ymm0[13],mem[14],ymm0[15] 7802; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm2 7803; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3,4],xmm2[5,6,7] 7804; AVX2-FCP-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm2 # 32-byte Folded Reload 7805; AVX2-FCP-NEXT: # ymm2 = ymm11[0,1],mem[2],ymm11[3],mem[4],ymm11[5,6],mem[7],ymm11[8,9],mem[10],ymm11[11],mem[12],ymm11[13,14],mem[15] 7806; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7] 7807; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm0, %xmm0 7808; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [1,3,0,0,6,0,3,5] 7809; AVX2-FCP-NEXT: vpermd %ymm2, %ymm5, %ymm2 7810; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [16,17,22,23,24,25,30,31,0,0,0,0,0,1,6,7,16,17,22,23,24,25,30,31,0,0,0,0,0,1,6,7] 7811; AVX2-FCP-NEXT: # ymm7 = mem[0,1,0,1] 7812; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm2, %ymm2 7813; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2],ymm2[3,4,5,6,7] 7814; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,2,1,3,0,2,5,7] 7815; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload 7816; AVX2-FCP-NEXT: vpbroadcastq {{.*#+}} ymm4 = [16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31,16,17,22,23,24,25,30,31] 7817; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3 7818; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7] 7819; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 7820; AVX2-FCP-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload 7821; AVX2-FCP-NEXT: # ymm3 = mem[0],ymm3[1],mem[2,3],ymm3[4],mem[5],ymm3[6],mem[7,8],ymm3[9],mem[10,11],ymm3[12],mem[13],ymm3[14],mem[15] 7822; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm11 7823; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm11[0,1,2],xmm3[3,4],xmm11[5,6,7] 7824; AVX2-FCP-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm11 # 32-byte Folded Reload 7825; AVX2-FCP-NEXT: # ymm11 = ymm15[0,1],mem[2],ymm15[3],mem[4],ymm15[5,6],mem[7],ymm15[8,9],mem[10],ymm15[11],mem[12],ymm15[13,14],mem[15] 7826; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm3, %xmm3 7827; AVX2-FCP-NEXT: vpermd %ymm11, %ymm5, %ymm11 7828; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm11, %ymm11 7829; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm11[3,4,5,6,7] 7830; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload 7831; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm9, %ymm9 7832; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm9[6,7] 7833; AVX2-FCP-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm9 # 32-byte Folded Reload 7834; AVX2-FCP-NEXT: # ymm9 = ymm12[0,1],mem[2],ymm12[3],mem[4],ymm12[5,6],mem[7],ymm12[8,9],mem[10],ymm12[11],mem[12],ymm12[13,14],mem[15] 7835; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload 7836; AVX2-FCP-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload 7837; AVX2-FCP-NEXT: # ymm11 = mem[0],ymm11[1],mem[2,3],ymm11[4],mem[5],ymm11[6],mem[7,8],ymm11[9],mem[10,11],ymm11[12],mem[13],ymm11[14],mem[15] 
7838; AVX2-FCP-NEXT: vextracti128 $1, %ymm11, %xmm12 7839; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1,2],xmm11[3,4],xmm12[5,6,7] 7840; AVX2-FCP-NEXT: vpermd %ymm9, %ymm5, %ymm9 7841; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm9, %ymm9 7842; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm11, %xmm11 7843; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2],ymm9[3,4,5,6,7] 7844; AVX2-FCP-NEXT: vpermd %ymm8, %ymm0, %ymm8 7845; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm8, %ymm8 7846; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7] 7847; AVX2-FCP-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm9 # 32-byte Folded Reload 7848; AVX2-FCP-NEXT: # ymm9 = ymm10[0],mem[1],ymm10[2,3],mem[4],ymm10[5],mem[6],ymm10[7,8],mem[9],ymm10[10,11],mem[12],ymm10[13],mem[14],ymm10[15] 7849; AVX2-FCP-NEXT: vextracti128 $1, %ymm9, %xmm11 7850; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0,1,2],xmm9[3,4],xmm11[5,6,7] 7851; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm9, %xmm6 7852; AVX2-FCP-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm9 # 32-byte Folded Reload 7853; AVX2-FCP-NEXT: # ymm9 = ymm13[0,1],mem[2],ymm13[3],mem[4],ymm13[5,6],mem[7],ymm13[8,9],mem[10],ymm13[11],mem[12],ymm13[13,14],mem[15] 7854; AVX2-FCP-NEXT: vpermd %ymm9, %ymm5, %ymm5 7855; AVX2-FCP-NEXT: vpshufb %ymm7, %ymm5, %ymm5 7856; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3,4,5,6,7] 7857; AVX2-FCP-NEXT: vpermd %ymm14, %ymm0, %ymm0 7858; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm0, %ymm0 7859; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7] 7860; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 7861; AVX2-FCP-NEXT: vmovaps %ymm4, 64(%rsi) 7862; AVX2-FCP-NEXT: vmovups (%rsp), %ymm4 # 32-byte Reload 7863; AVX2-FCP-NEXT: vmovaps %ymm4, (%rsi) 7864; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 7865; AVX2-FCP-NEXT: vmovaps %ymm4, 96(%rsi) 7866; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 7867; AVX2-FCP-NEXT: vmovaps %ymm4, 32(%rsi) 7868; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 7869; AVX2-FCP-NEXT: vmovaps %ymm4, 64(%rdx) 7870; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 7871; AVX2-FCP-NEXT: vmovaps %ymm4, (%rdx) 7872; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 7873; AVX2-FCP-NEXT: vmovaps %ymm4, 96(%rdx) 7874; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 7875; AVX2-FCP-NEXT: vmovaps %ymm4, 32(%rdx) 7876; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 7877; AVX2-FCP-NEXT: vmovaps %ymm4, 64(%rcx) 7878; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 7879; AVX2-FCP-NEXT: vmovaps %ymm4, (%rcx) 7880; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 7881; AVX2-FCP-NEXT: vmovaps %ymm4, 96(%rcx) 7882; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 7883; AVX2-FCP-NEXT: vmovaps %ymm4, 32(%rcx) 7884; AVX2-FCP-NEXT: vmovdqa %ymm1, 64(%r8) 7885; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 7886; AVX2-FCP-NEXT: vmovaps %ymm1, (%r8) 7887; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 7888; AVX2-FCP-NEXT: vmovaps %ymm1, 96(%r8) 7889; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 7890; AVX2-FCP-NEXT: vmovaps %ymm1, 32(%r8) 7891; AVX2-FCP-NEXT: vmovdqa %ymm0, 64(%r9) 7892; AVX2-FCP-NEXT: vmovdqa %ymm8, (%r9) 7893; AVX2-FCP-NEXT: vmovdqa %ymm3, 96(%r9) 7894; AVX2-FCP-NEXT: 
vmovdqa %ymm2, 32(%r9) 7895; AVX2-FCP-NEXT: addq $1000, %rsp # imm = 0x3E8 7896; AVX2-FCP-NEXT: vzeroupper 7897; AVX2-FCP-NEXT: retq 7898; 7899; AVX512-LABEL: load_i16_stride5_vf64: 7900; AVX512: # %bb.0: 7901; AVX512-NEXT: subq $552, %rsp # imm = 0x228 7902; AVX512-NEXT: vmovdqa 384(%rdi), %ymm6 7903; AVX512-NEXT: vmovdqa 416(%rdi), %ymm11 7904; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm11[1,2],ymm6[3],ymm11[4],ymm6[5],ymm11[6,7],ymm6[8],ymm11[9,10],ymm6[11],ymm11[12],ymm6[13],ymm11[14,15] 7905; AVX512-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7906; AVX512-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7907; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] 7908; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6],ymm1[7] 7909; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,6,7,16,17,26,27,20,21,30,31,24,25,128,128,128,128,128,128] 7910; AVX512-NEXT: vpshufb %ymm0, %ymm1, %ymm2 7911; AVX512-NEXT: vmovdqa 352(%rdi), %ymm8 7912; AVX512-NEXT: vmovdqa 320(%rdi), %ymm7 7913; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0],ymm8[1],ymm7[2,3],ymm8[4],ymm7[5],ymm8[6],ymm7[7,8],ymm8[9],ymm7[10,11],ymm8[12],ymm7[13],ymm8[14],ymm7[15] 7914; AVX512-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7915; AVX512-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7916; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm3 7917; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0],xmm3[1,2,3],xmm1[4,5],xmm3[6,7] 7918; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u] 7919; AVX512-NEXT: vpshufb %ymm1, %ymm3, %ymm3 7920; AVX512-NEXT: vporq %ymm2, %ymm3, %ymm19 7921; AVX512-NEXT: vmovdqa 192(%rdi), %ymm15 7922; AVX512-NEXT: vmovdqa 224(%rdi), %ymm13 7923; AVX512-NEXT: vmovdqa 176(%rdi), %xmm12 7924; AVX512-NEXT: vmovdqa 160(%rdi), %xmm14 7925; AVX512-NEXT: vmovdqa (%rdi), %ymm4 7926; AVX512-NEXT: vmovdqa 32(%rdi), %ymm5 7927; AVX512-NEXT: vmovdqa 64(%rdi), %ymm10 7928; AVX512-NEXT: vmovdqa 96(%rdi), %ymm9 7929; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm10[0],ymm9[1,2],ymm10[3],ymm9[4],ymm10[5],ymm9[6,7],ymm10[8],ymm9[9,10],ymm10[11],ymm9[12],ymm10[13],ymm9[14,15] 7930; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] 7931; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6],ymm3[7] 7932; AVX512-NEXT: vpshufb %ymm0, %ymm2, %ymm0 7933; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10,11],ymm5[12],ymm4[13],ymm5[14],ymm4[15] 7934; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3 7935; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4,5],xmm3[6,7] 7936; AVX512-NEXT: vpshufb %ymm1, %ymm2, %ymm1 7937; AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0 7938; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 7939; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0],ymm6[1],ymm11[2,3],ymm6[4],ymm11[5],ymm6[6],ymm11[7,8],ymm6[9],ymm11[10,11],ymm6[12],ymm11[13],ymm6[14],ymm11[15] 7940; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] 7941; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] 7942; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,8,9,18,19,28,29,22,23,16,17,26,27,128,128,128,128,128,128] 7943; AVX512-NEXT: vpshufb %ymm0, %ymm1, %ymm2 7944; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = 
ymm8[0],ymm7[1],ymm8[2],ymm7[3],ymm8[4,5],ymm7[6],ymm8[7,8],ymm7[9],ymm8[10],ymm7[11],ymm8[12,13],ymm7[14],ymm8[15] 7945; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm3 7946; AVX512-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1],xmm3[2,3],xmm1[4,5,6],xmm3[7] 7947; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u] 7948; AVX512-NEXT: vpshufb %ymm1, %ymm3, %ymm3 7949; AVX512-NEXT: vpor %ymm2, %ymm3, %ymm2 7950; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 7951; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm9[0],ymm10[1],ymm9[2,3],ymm10[4],ymm9[5],ymm10[6],ymm9[7,8],ymm10[9],ymm9[10,11],ymm10[12],ymm9[13],ymm10[14],ymm9[15] 7952; AVX512-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7953; AVX512-NEXT: vmovdqu %ymm9, (%rsp) # 32-byte Spill 7954; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] 7955; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7] 7956; AVX512-NEXT: vpshufb %ymm0, %ymm2, %ymm0 7957; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15] 7958; AVX512-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7959; AVX512-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 7960; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3 7961; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5,6],xmm3[7] 7962; AVX512-NEXT: vpshufb %ymm1, %ymm2, %ymm1 7963; AVX512-NEXT: vporq %ymm0, %ymm1, %ymm28 7964; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = [6,7,0,1,10,11,4,5,14,15,14,15,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29] 7965; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm13[0,1],ymm15[2],ymm13[3],ymm15[4],ymm13[5,6],ymm15[7],ymm13[8,9],ymm15[10],ymm13[11],ymm15[12],ymm13[13,14],ymm15[15] 7966; AVX512-NEXT: vmovdqa64 %ymm15, %ymm18 7967; AVX512-NEXT: vmovdqa64 %ymm13, %ymm24 7968; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 7969; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6,7] 7970; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,7,6,7,8,9,4,5,14,15,8,9,2,3,12,13,22,23,22,23,24,25,20,21,30,31,24,25,18,19,28,29] 7971; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm12[0],xmm14[1],xmm12[2,3] 7972; AVX512-NEXT: vmovdqa64 %xmm12, %xmm16 7973; AVX512-NEXT: vmovdqa64 %xmm14, %xmm30 7974; AVX512-NEXT: vpshufb %xmm6, %xmm1, %xmm1 7975; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4,5,6,7] 7976; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 7977; AVX512-NEXT: vmovdqa 144(%rdi), %xmm11 7978; AVX512-NEXT: vmovdqa 128(%rdi), %xmm7 7979; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm11[0,1],xmm7[2],xmm11[3] 7980; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13] 7981; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 7982; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 7983; AVX512-NEXT: vpblendw {{.*#+}} ymm1 = ymm9[0],ymm10[1],ymm9[2],ymm10[3],ymm9[4,5],ymm10[6],ymm9[7,8],ymm10[9],ymm9[10],ymm10[11],ymm9[12,13],ymm10[14],ymm9[15] 7984; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] 7985; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] 7986; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm5[1,2],ymm4[3],ymm5[4],ymm4[5],ymm5[6,7],ymm4[8],ymm5[9,10],ymm4[11],ymm5[12],ymm4[13],ymm5[14,15] 7987; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3 7988; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3] 7989; AVX512-NEXT: vpshufb %ymm6, 
%ymm1, %ymm1 7990; AVX512-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[6,7,0,1,10,11,4,5,14,15,8,9,12,13,14,15] 7991; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] 7992; AVX512-NEXT: vmovdqa 256(%rdi), %ymm12 7993; AVX512-NEXT: vmovdqa 288(%rdi), %ymm15 7994; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm12[0],ymm15[1],ymm12[2,3],ymm15[4],ymm12[5],ymm15[6],ymm12[7,8],ymm15[9],ymm12[10,11],ymm15[12],ymm12[13],ymm15[14],ymm12[15] 7995; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm3 7996; AVX512-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4,5],xmm3[6,7] 7997; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm0 ^ (mem & (zmm1 ^ zmm0)) 7998; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm2[0,1,0,1,10,11,4,5,14,15,8,9,2,3,12,13] 7999; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 8000; AVX512-NEXT: vextracti64x4 $1, %zmm1, %ymm2 8001; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15] 8002; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] 8003; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 8004; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 8005; AVX512-NEXT: vmovdqa 464(%rdi), %xmm8 8006; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[3,1,2,3] 8007; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7] 8008; AVX512-NEXT: vmovdqa 448(%rdi), %xmm3 8009; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[0,2,2,3] 8010; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] 8011; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 8012; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8013; AVX512-NEXT: vmovdqa %xmm11, %xmm6 8014; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm11[3,1,2,3] 8015; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7] 8016; AVX512-NEXT: vmovdqa %xmm7, %xmm9 8017; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[0,2,2,3] 8018; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] 8019; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 8020; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8021; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0],xmm8[1],xmm3[2,3] 8022; AVX512-NEXT: vmovdqa64 %xmm3, %xmm22 8023; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = [6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7] 8024; AVX512-NEXT: vpshufb %xmm1, %xmm0, %xmm0 8025; AVX512-NEXT: vmovdqa64 %xmm1, %xmm23 8026; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm10 8027; AVX512-NEXT: vmovdqa 576(%rdi), %ymm1 8028; AVX512-NEXT: vmovdqa 608(%rdi), %ymm2 8029; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1],ymm1[2],ymm2[3],ymm1[4],ymm2[5,6],ymm1[7],ymm2[8,9],ymm1[10],ymm2[11],ymm1[12],ymm2[13,14],ymm1[15] 8030; AVX512-NEXT: vmovdqa64 %ymm2, %ymm20 8031; AVX512-NEXT: vmovdqa64 %ymm1, %ymm17 8032; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 8033; AVX512-NEXT: vpblendw {{.*#+}} xmm11 = xmm0[0,1,2],xmm1[3,4],xmm0[5,6,7] 8034; AVX512-NEXT: vmovdqa 512(%rdi), %ymm5 8035; AVX512-NEXT: vmovdqa 544(%rdi), %ymm13 8036; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm13[0],ymm5[1],ymm13[2,3],ymm5[4],ymm13[5],ymm5[6],ymm13[7,8],ymm5[9],ymm13[10,11],ymm5[12],ymm13[13],ymm5[14],ymm13[15] 8037; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 8038; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5,6,7] 8039; AVX512-NEXT: vmovdqa64 496(%rdi), %xmm21 8040; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm21[3,1,2,3] 8041; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[2,1,2,3,4,5,6,7] 8042; AVX512-NEXT: vmovdqa 480(%rdi), %xmm7 8043; AVX512-NEXT: vpshufd {{.*#+}} 
xmm14 = xmm7[0,2,2,3] 8044; AVX512-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,3,2,3,4,5,6,7] 8045; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] 8046; AVX512-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,u,u,u,u,8,9,2,3,12,13,6,7,16,17,26,27,26,27,30,31,24,25,18,19,28,29,22,23] 8047; AVX512-NEXT: vpshufb %ymm14, %ymm2, %ymm2 8048; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7] 8049; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [12,13,14,15,4,5,14,15,8,9,2,3,12,13,6,7] 8050; AVX512-NEXT: vpshufb %xmm2, %xmm11, %xmm11 8051; AVX512-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 8052; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm11[5,6,7] 8053; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm27 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] 8054; AVX512-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm27 & (zmm10 ^ zmm19)) 8055; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm10, %zmm0 8056; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 8057; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm15[0,1],ymm12[2],ymm15[3],ymm12[4],ymm15[5,6],ymm12[7],ymm15[8,9],ymm12[10],ymm15[11],ymm12[12],ymm15[13,14],ymm12[15] 8058; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm10 8059; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm10[3,4],xmm0[5,6,7] 8060; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm0 8061; AVX512-NEXT: vmovdqa64 %ymm18, %ymm3 8062; AVX512-NEXT: vmovdqa64 %ymm24, %ymm1 8063; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5],ymm3[6],ymm1[7,8],ymm3[9],ymm1[10,11],ymm3[12],ymm1[13],ymm3[14],ymm1[15] 8064; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm10 8065; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0],ymm2[1,2],ymm10[3],ymm2[4,5,6,7] 8066; AVX512-NEXT: vpshufb %ymm14, %ymm2, %ymm2 8067; AVX512-NEXT: vpshufd {{.*#+}} xmm10 = xmm16[3,1,2,3] 8068; AVX512-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[2,1,2,3,4,5,6,7] 8069; AVX512-NEXT: vmovdqa64 %xmm30, %xmm24 8070; AVX512-NEXT: vpshufd {{.*#+}} xmm11 = xmm30[0,2,2,3] 8071; AVX512-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,3,2,3,4,5,6,7] 8072; AVX512-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] 8073; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1],ymm2[2,3,4,5,6,7] 8074; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 8075; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7] 8076; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm9[0],xmm6[1],xmm9[2,3] 8077; AVX512-NEXT: vmovdqa64 %xmm9, %xmm25 8078; AVX512-NEXT: vmovdqa64 %xmm23, %xmm4 8079; AVX512-NEXT: vpshufb %xmm4, %xmm2, %xmm2 8080; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 8081; AVX512-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm2 # 64-byte Folded Reload 8082; AVX512-NEXT: # zmm2 = zmm2 ^ (zmm27 & (zmm2 ^ mem)) 8083; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 8084; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 8085; AVX512-NEXT: vmovdqa64 %xmm22, %xmm4 8086; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0,1],xmm8[2],xmm4[3] 8087; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9] 8088; AVX512-NEXT: vpshufb %xmm0, %xmm2, %xmm2 8089; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 8090; AVX512-NEXT: vpblendw {{.*#+}} ymm10 = ymm5[0],ymm13[1],ymm5[2,3],ymm13[4],ymm5[5],ymm13[6],ymm5[7,8],ymm13[9],ymm5[10,11],ymm13[12],ymm5[13],ymm13[14],ymm5[15] 8091; AVX512-NEXT: vmovdqa64 %ymm13, %ymm26 8092; AVX512-NEXT: vmovdqa64 %ymm5, %ymm22 8093; AVX512-NEXT: 
vextracti128 $1, %ymm10, %xmm11 8094; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0],ymm11[1],ymm10[2],ymm11[3],ymm10[4,5,6,7] 8095; AVX512-NEXT: vpshufd {{.*#+}} xmm11 = xmm7[0,3,2,3] 8096; AVX512-NEXT: vmovdqa64 %xmm7, %xmm30 8097; AVX512-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[1,2,2,3,4,5,6,7] 8098; AVX512-NEXT: vpsrlq $48, %xmm21, %xmm13 8099; AVX512-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm11[0],xmm13[0],xmm11[1],xmm13[1] 8100; AVX512-NEXT: vmovdqa {{.*#+}} ymm13 = [2,3,12,13,0,1,0,1,10,11,4,5,14,15,8,9,18,19,28,29,16,17,16,17,26,27,20,21,30,31,24,25] 8101; AVX512-NEXT: vpshufb %ymm13, %ymm10, %ymm10 8102; AVX512-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm10[3,4,5,6,7] 8103; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] 8104; AVX512-NEXT: vmovdqa64 %ymm20, %ymm9 8105; AVX512-NEXT: vmovdqa64 %ymm17, %ymm7 8106; AVX512-NEXT: vpblendw {{.*#+}} ymm11 = ymm7[0],ymm9[1,2],ymm7[3],ymm9[4],ymm7[5],ymm9[6,7],ymm7[8],ymm9[9,10],ymm7[11],ymm9[12],ymm7[13],ymm9[14,15] 8107; AVX512-NEXT: vextracti128 $1, %ymm11, %xmm14 8108; AVX512-NEXT: vpblendd {{.*#+}} xmm11 = xmm14[0],xmm11[1],xmm14[2],xmm11[3] 8109; AVX512-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,2,3,6,7,0,1,10,11,4,5,14,15,8,9] 8110; AVX512-NEXT: vpshufb %xmm14, %xmm11, %xmm11 8111; AVX512-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 8112; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm11[5,6,7] 8113; AVX512-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm2 # 64-byte Folded Reload 8114; AVX512-NEXT: # zmm2 = zmm2 ^ (zmm27 & (zmm2 ^ mem)) 8115; AVX512-NEXT: vinserti64x4 $1, %ymm10, %zmm2, %zmm2 8116; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 8117; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5],ymm1[6],ymm3[7,8],ymm1[9],ymm3[10,11],ymm1[12],ymm3[13],ymm1[14],ymm3[15] 8118; AVX512-NEXT: vmovdqa64 %ymm1, %ymm17 8119; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm10 8120; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm10[1],ymm2[2],ymm10[3],ymm2[4,5,6,7] 8121; AVX512-NEXT: vpshufb %ymm13, %ymm2, %ymm2 8122; AVX512-NEXT: vpshufd {{.*#+}} xmm10 = xmm24[0,3,2,3] 8123; AVX512-NEXT: vmovdqa64 %xmm24, %xmm20 8124; AVX512-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[1,2,2,3,4,5,6,7] 8125; AVX512-NEXT: vmovdqa64 %xmm16, %xmm1 8126; AVX512-NEXT: vpsrlq $48, %xmm16, %xmm11 8127; AVX512-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1] 8128; AVX512-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm2[3,4,5,6,7] 8129; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm2[4,5,6,7] 8130; AVX512-NEXT: vpblendw {{.*#+}} ymm10 = ymm12[0],ymm15[1,2],ymm12[3],ymm15[4],ymm12[5],ymm15[6,7],ymm12[8],ymm15[9,10],ymm12[11],ymm15[12],ymm12[13],ymm15[14,15] 8131; AVX512-NEXT: vmovdqa64 %ymm12, %ymm19 8132; AVX512-NEXT: vextracti128 $1, %ymm10, %xmm11 8133; AVX512-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2],xmm10[3] 8134; AVX512-NEXT: vpshufb %xmm14, %xmm10, %xmm10 8135; AVX512-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 8136; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm10[5,6,7] 8137; AVX512-NEXT: vmovdqa64 %xmm25, %xmm5 8138; AVX512-NEXT: vpblendd {{.*#+}} xmm10 = xmm5[0,1],xmm6[2],xmm5[3] 8139; AVX512-NEXT: vmovdqa64 %xmm6, %xmm23 8140; AVX512-NEXT: vpshufb %xmm0, %xmm10, %xmm0 8141; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 8142; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm27 & (zmm0 ^ zmm28)) 8143; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 8144; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 
64-byte Spill 8145; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload 8146; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Reload 8147; AVX512-NEXT: vmovdqa64 %ymm18, %ymm0 8148; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2,3],ymm12[4],ymm0[5],ymm12[6],ymm0[7,8],ymm12[9],ymm0[10,11],ymm12[12],ymm0[13],ymm12[14],ymm0[15] 8149; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] 8150; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm2[6],ymm0[7] 8151; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm24 # 32-byte Reload 8152; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Reload 8153; AVX512-NEXT: vmovdqa64 %ymm24, %ymm0 8154; AVX512-NEXT: vmovdqa64 %ymm16, %ymm5 8155; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm5[2],ymm0[3],ymm5[4],ymm0[5,6],ymm5[7],ymm0[8,9],ymm5[10],ymm0[11],ymm5[12],ymm0[13,14],ymm5[15] 8156; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm10 8157; AVX512-NEXT: vpblendw {{.*#+}} xmm10 = xmm0[0,1,2],xmm10[3,4],xmm0[5,6,7] 8158; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [4,5,14,15,8,9,2,3,12,13,10,11,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27] 8159; AVX512-NEXT: vpshufb %ymm0, %ymm2, %ymm2 8160; AVX512-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[4,5,14,15,8,9,2,3,12,13,6,7,12,13,14,15] 8161; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2],ymm2[3,4,5,6,7] 8162; AVX512-NEXT: vpblendd {{.*#+}} xmm13 = xmm8[0],xmm4[1],xmm8[2,3] 8163; AVX512-NEXT: vmovdqa64 %xmm4, %xmm29 8164; AVX512-NEXT: vmovdqa64 %xmm8, %xmm28 8165; AVX512-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11] 8166; AVX512-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 8167; AVX512-NEXT: vpternlogq {{.*#+}} zmm13 = zmm13 ^ (zmm27 & (zmm13 ^ zmm2)) 8168; AVX512-NEXT: vmovdqa64 %ymm26, %ymm8 8169; AVX512-NEXT: vmovdqa64 %ymm22, %ymm4 8170; AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0],ymm4[1],ymm8[2],ymm4[3],ymm8[4,5],ymm4[6],ymm8[7,8],ymm4[9],ymm8[10],ymm4[11],ymm8[12,13],ymm4[14],ymm8[15] 8171; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm14 8172; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm14[1],ymm2[2,3,4,5,6,7] 8173; AVX512-NEXT: vmovdqa {{.*#+}} ymm14 = [4,5,14,15,12,13,2,3,12,13,6,7,0,1,10,11,20,21,30,31,28,29,18,19,28,29,22,23,16,17,26,27] 8174; AVX512-NEXT: vpshufb %ymm14, %ymm2, %ymm2 8175; AVX512-NEXT: vmovdqa64 %xmm30, %xmm6 8176; AVX512-NEXT: vpshufd {{.*#+}} xmm11 = xmm30[0,1,1,3] 8177; AVX512-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,7,6,7] 8178; AVX512-NEXT: vmovdqa64 %xmm21, %xmm5 8179; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm11 = xmm11[2],xmm21[2],xmm11[3],xmm21[3] 8180; AVX512-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm2[3,4,5,6,7] 8181; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm2[4,5,6,7] 8182; AVX512-NEXT: vpblendw {{.*#+}} ymm11 = ymm9[0],ymm7[1],ymm9[2,3],ymm7[4],ymm9[5],ymm7[6],ymm9[7,8],ymm7[9],ymm9[10,11],ymm7[12],ymm9[13],ymm7[14],ymm9[15] 8183; AVX512-NEXT: vmovdqa64 %ymm7, %ymm21 8184; AVX512-NEXT: vextracti128 $1, %ymm11, %xmm10 8185; AVX512-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm11[3,4],xmm10[5,6,7] 8186; AVX512-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,2,3,8,9,2,3,12,13,6,7,0,1,10,11] 8187; AVX512-NEXT: vpshufb %xmm11, %xmm10, %xmm10 8188; AVX512-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 8189; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm10[5,6,7] 8190; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm13, %zmm2 8191; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 8192; AVX512-NEXT: vmovdqa64 %ymm17, %ymm2 8193; 
AVX512-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10],ymm3[11],ymm2[12,13],ymm3[14],ymm2[15] 8194; AVX512-NEXT: vmovdqa64 %ymm3, %ymm22 8195; AVX512-NEXT: vmovdqa64 %ymm17, %ymm26 8196; AVX512-NEXT: vextracti128 $1, %ymm2, %xmm10 8197; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm10[1],ymm2[2,3,4,5,6,7] 8198; AVX512-NEXT: vpshufb %ymm14, %ymm2, %ymm2 8199; AVX512-NEXT: vpshufd {{.*#+}} xmm10 = xmm20[0,1,1,3] 8200; AVX512-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,7,6,7] 8201; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm10 = xmm10[2],xmm1[2],xmm10[3],xmm1[3] 8202; AVX512-NEXT: vmovdqa64 %xmm1, %xmm17 8203; AVX512-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm2[3,4,5,6,7] 8204; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm2[4,5,6,7] 8205; AVX512-NEXT: vmovdqa64 %ymm19, %ymm1 8206; AVX512-NEXT: vpblendw {{.*#+}} ymm10 = ymm15[0],ymm1[1],ymm15[2,3],ymm1[4],ymm15[5],ymm1[6],ymm15[7,8],ymm1[9],ymm15[10,11],ymm1[12],ymm15[13],ymm1[14],ymm15[15] 8207; AVX512-NEXT: vmovdqa64 %ymm15, %ymm31 8208; AVX512-NEXT: vextracti128 $1, %ymm10, %xmm13 8209; AVX512-NEXT: vpblendw {{.*#+}} xmm10 = xmm13[0,1,2],xmm10[3,4],xmm13[5,6,7] 8210; AVX512-NEXT: vpshufb %xmm11, %xmm10, %xmm10 8211; AVX512-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 8212; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm10[5,6,7] 8213; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 8214; AVX512-NEXT: vpblendw $82, (%rsp), %ymm1, %ymm10 # 32-byte Folded Reload 8215; AVX512-NEXT: # ymm10 = ymm1[0],mem[1],ymm1[2,3],mem[4],ymm1[5],mem[6],ymm1[7,8],mem[9],ymm1[10,11],mem[12],ymm1[13],mem[14],ymm1[15] 8216; AVX512-NEXT: vpermq {{.*#+}} ymm11 = ymm10[2,3,0,1] 8217; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6],ymm10[7] 8218; AVX512-NEXT: vpshufb %ymm0, %ymm10, %ymm0 8219; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 8220; AVX512-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload 8221; AVX512-NEXT: # ymm10 = ymm1[0,1],mem[2],ymm1[3],mem[4],ymm1[5,6],mem[7],ymm1[8,9],mem[10],ymm1[11],mem[12],ymm1[13,14],mem[15] 8222; AVX512-NEXT: vextracti128 $1, %ymm10, %xmm11 8223; AVX512-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm11[3,4],xmm10[5,6,7] 8224; AVX512-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[4,5,14,15,8,9,2,3,12,13,6,7,12,13,14,15] 8225; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3,4,5,6,7] 8226; AVX512-NEXT: vmovdqa64 %xmm25, %xmm1 8227; AVX512-NEXT: vmovdqa64 %xmm23, %xmm3 8228; AVX512-NEXT: vpblendd {{.*#+}} xmm10 = xmm3[0],xmm1[1],xmm3[2,3] 8229; AVX512-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11] 8230; AVX512-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 8231; AVX512-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm27 & (zmm10 ^ zmm0)) 8232; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm10, %zmm23 8233; AVX512-NEXT: vmovdqa64 %ymm18, %ymm13 8234; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0],ymm13[1],ymm12[2],ymm13[3],ymm12[4,5],ymm13[6],ymm12[7,8],ymm13[9],ymm12[10],ymm13[11],ymm12[12,13],ymm13[14],ymm12[15] 8235; AVX512-NEXT: vpermq {{.*#+}} ymm10 = ymm0[2,3,0,1] 8236; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4],ymm0[5],ymm10[6],ymm0[7] 8237; AVX512-NEXT: vmovdqa64 %ymm16, %ymm14 8238; AVX512-NEXT: vmovdqa64 %ymm24, %ymm15 8239; AVX512-NEXT: vpblendw {{.*#+}} ymm10 = ymm14[0],ymm15[1,2],ymm14[3],ymm15[4],ymm14[5],ymm15[6,7],ymm14[8],ymm15[9,10],ymm14[11],ymm15[12],ymm14[13],ymm15[14,15] 8240; AVX512-NEXT: 
vextracti128 $1, %ymm10, %xmm11 8241; AVX512-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2],xmm10[3] 8242; AVX512-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[6,7,0,1,10,11,4,5,14,15,8,9,12,13,14,15] 8243; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [6,7,0,1,10,11,4,5,14,15,14,15,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29] 8244; AVX512-NEXT: vpshufb %ymm1, %ymm0, %ymm0 8245; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3,4,5,6,7] 8246; AVX512-NEXT: vmovdqa %ymm4, %ymm3 8247; AVX512-NEXT: vmovdqa %ymm8, %ymm4 8248; AVX512-NEXT: vpblendw {{.*#+}} ymm10 = ymm8[0,1],ymm3[2],ymm8[3],ymm3[4],ymm8[5,6],ymm3[7],ymm8[8,9],ymm3[10],ymm8[11],ymm3[12],ymm8[13,14],ymm3[15] 8249; AVX512-NEXT: vextracti128 $1, %ymm10, %xmm11 8250; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1],ymm11[2],ymm10[3,4,5,6,7] 8251; AVX512-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[6,7,6,7,8,9,4,5,14,15,8,9,2,3,12,13,22,23,22,23,24,25,20,21,30,31,24,25,18,19,28,29] 8252; AVX512-NEXT: vpblendd {{.*#+}} xmm11 = xmm5[0],xmm6[1],xmm5[2,3] 8253; AVX512-NEXT: vmovdqa64 %xmm30, %xmm7 8254; AVX512-NEXT: vmovdqa %xmm5, %xmm6 8255; AVX512-NEXT: vpshufb %xmm1, %xmm11, %xmm11 8256; AVX512-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm10[3,4,5,6,7] 8257; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] 8258; AVX512-NEXT: vmovdqa64 %xmm29, %xmm1 8259; AVX512-NEXT: vmovdqa64 %xmm28, %xmm2 8260; AVX512-NEXT: vpblendd {{.*#+}} xmm8 = xmm2[0,1],xmm1[2],xmm2[3] 8261; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13] 8262; AVX512-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 8263; AVX512-NEXT: vinserti64x4 $1, %ymm10, %zmm8, %zmm8 8264; AVX512-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (mem & (zmm8 ^ zmm0)) 8265; AVX512-NEXT: vmovdqa %ymm9, %ymm2 8266; AVX512-NEXT: vmovdqa64 %ymm21, %ymm10 8267; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5],ymm9[6],ymm10[7,8],ymm9[9],ymm10[10,11],ymm9[12],ymm10[13],ymm9[14],ymm10[15] 8268; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm9 8269; AVX512-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm9[1,2,3],xmm0[4,5],xmm9[6,7] 8270; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,10,11,4,5,14,15,8,9,2,3,12,13] 8271; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 8272; AVX512-NEXT: vextracti64x4 $1, %zmm8, %ymm9 8273; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm0[1,2,3,4,5,6,7],ymm9[8],ymm0[9,10,11,12,13,14,15] 8274; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] 8275; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm8, %zmm25 8276; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0,1],ymm13[2],ymm12[3],ymm13[4],ymm12[5,6],ymm13[7],ymm12[8,9],ymm13[10],ymm12[11],ymm13[12],ymm12[13,14],ymm13[15] 8277; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm0[2,3,0,1] 8278; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4],ymm0[5,6],ymm8[7] 8279; AVX512-NEXT: vpblendw {{.*#+}} ymm8 = ymm15[0],ymm14[1],ymm15[2,3],ymm14[4],ymm15[5],ymm14[6],ymm15[7,8],ymm14[9],ymm15[10,11],ymm14[12],ymm15[13],ymm14[14],ymm15[15] 8280; AVX512-NEXT: vextracti128 $1, %ymm8, %xmm9 8281; AVX512-NEXT: vpblendw {{.*#+}} xmm11 = xmm9[0,1,2],xmm8[3,4],xmm9[5,6,7] 8282; AVX512-NEXT: vmovdqa {{.*#+}} ymm9 = [8,9,2,3,12,13,6,7,4,5,6,7,4,5,14,15,24,25,18,19,28,29,22,23,20,21,22,23,20,21,30,31] 8283; AVX512-NEXT: vpshufb %ymm9, %ymm0, %ymm0 8284; AVX512-NEXT: vmovdqa {{.*#+}} xmm8 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7] 8285; AVX512-NEXT: vpshufb %xmm8, %xmm11, %xmm11 8286; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2],ymm0[3,4,5,6,7] 8287; AVX512-NEXT: 
vpblendw {{.*#+}} ymm4 = ymm3[0],ymm4[1,2],ymm3[3],ymm4[4],ymm3[5],ymm4[6,7],ymm3[8],ymm4[9,10],ymm3[11],ymm4[12],ymm3[13],ymm4[14,15] 8288; AVX512-NEXT: vextracti128 $1, %ymm4, %xmm5 8289; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3,4,5,6,7] 8290; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm6[0,1],xmm7[2],xmm6[3] 8291; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [8,9,14,15,4,5,6,7,0,1,10,11,4,5,14,15,24,25,30,31,20,21,22,23,16,17,26,27,20,21,30,31] 8292; AVX512-NEXT: vpshufb %ymm3, %ymm4, %ymm4 8293; AVX512-NEXT: vpshufb %xmm9, %xmm1, %xmm1 8294; AVX512-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3,4,5,6,7] 8295; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] 8296; AVX512-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 16-byte Folded Reload 8297; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm4, %zmm1 8298; AVX512-NEXT: movb $7, %al 8299; AVX512-NEXT: kmovw %eax, %k1 8300; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm1 {%k1} 8301; AVX512-NEXT: vextracti64x4 $1, %zmm1, %ymm4 8302; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm10[1],ymm2[2],ymm10[3],ymm2[4,5],ymm10[6],ymm2[7,8],ymm10[9],ymm2[10],ymm10[11],ymm2[12,13],ymm10[14],ymm2[15] 8303; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm5 8304; AVX512-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm5[2,3],xmm0[4,5,6],xmm5[7] 8305; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [6,7,2,3,12,13,6,7,0,1,10,11,4,5,14,15] 8306; AVX512-NEXT: vpshufb %xmm0, %xmm5, %xmm5 8307; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 8308; AVX512-NEXT: vpblendw {{.*#+}} ymm5 = ymm4[0],ymm5[1,2,3,4,5,6,7],ymm4[8],ymm5[9,10,11,12,13,14,15] 8309; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] 8310; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1 8311; AVX512-NEXT: vmovdqa64 %ymm22, %ymm2 8312; AVX512-NEXT: vmovdqa64 %ymm26, %ymm4 8313; AVX512-NEXT: vpblendw {{.*#+}} ymm4 = ymm2[0],ymm4[1,2],ymm2[3],ymm4[4],ymm2[5],ymm4[6,7],ymm2[8],ymm4[9,10],ymm2[11],ymm4[12],ymm2[13],ymm4[14,15] 8314; AVX512-NEXT: vextracti128 $1, %ymm4, %xmm5 8315; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3,4,5,6,7] 8316; AVX512-NEXT: vpshufb %ymm3, %ymm4, %ymm3 8317; AVX512-NEXT: vmovdqa64 %xmm20, %xmm2 8318; AVX512-NEXT: vmovdqa64 %xmm17, %xmm4 8319; AVX512-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm2[2],xmm4[3] 8320; AVX512-NEXT: vpshufb %xmm9, %xmm4, %xmm4 8321; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm3[3,4,5,6,7] 8322; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] 8323; AVX512-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 16-byte Folded Reload 8324; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 8325; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 8326; AVX512-NEXT: vpblendw $107, (%rsp), %ymm2, %ymm4 # 32-byte Folded Reload 8327; AVX512-NEXT: # ymm4 = mem[0,1],ymm2[2],mem[3],ymm2[4],mem[5,6],ymm2[7],mem[8,9],ymm2[10],mem[11],ymm2[12],mem[13,14],ymm2[15] 8328; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] 8329; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4],ymm4[5,6],ymm5[7] 8330; AVX512-NEXT: vpshufb %ymm9, %ymm4, %ymm4 8331; AVX512-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 8332; AVX512-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload 8333; AVX512-NEXT: # ymm5 = ymm2[0],mem[1],ymm2[2,3],mem[4],ymm2[5],mem[6],ymm2[7,8],mem[9],ymm2[10,11],mem[12],ymm2[13],mem[14],ymm2[15] 8334; AVX512-NEXT: vextracti128 $1, %ymm5, %xmm6 8335; AVX512-NEXT: vpblendw {{.*#+}} xmm5 
= xmm6[0,1,2],xmm5[3,4],xmm6[5,6,7]
; AVX512-NEXT: vpshufb %xmm8, %xmm5, %xmm5
; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3,4,5,6,7]
; AVX512-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm3 {%k1}
; AVX512-NEXT: vmovdqa64 %ymm19, %ymm2
; AVX512-NEXT: vmovdqa64 %ymm31, %ymm4
; AVX512-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm2[1],ymm4[2],ymm2[3],ymm4[4,5],ymm2[6],ymm4[7,8],ymm2[9],ymm4[10],ymm2[11],ymm4[12,13],ymm2[14],ymm4[15]
; AVX512-NEXT: vextracti128 $1, %ymm4, %xmm5
; AVX512-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5,6],xmm5[7]
; AVX512-NEXT: vpshufb %xmm0, %xmm4, %xmm0
; AVX512-NEXT: vextracti64x4 $1, %zmm3, %ymm4
; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1,2,3,4,5,6,7],ymm4[8],ymm0[9,10,11,12,13,14,15]
; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0
; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
; AVX512-NEXT: vmovaps %zmm2, (%rsi)
; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
; AVX512-NEXT: vmovaps %zmm2, 64(%rsi)
; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
; AVX512-NEXT: vmovaps %zmm2, 64(%rdx)
; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
; AVX512-NEXT: vmovaps %zmm2, (%rdx)
; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
; AVX512-NEXT: vmovaps %zmm2, 64(%rcx)
; AVX512-NEXT: vmovdqa64 %zmm23, (%rcx)
; AVX512-NEXT: vmovdqa64 %zmm25, 64(%r8)
; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
; AVX512-NEXT: vmovaps %zmm2, (%r8)
; AVX512-NEXT: vmovdqa64 %zmm1, 64(%r9)
; AVX512-NEXT: vmovdqa64 %zmm0, (%r9)
; AVX512-NEXT: addq $552, %rsp # imm = 0x228
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
;
; AVX512-FCP-LABEL: load_i16_stride5_vf64:
; AVX512-FCP: # %bb.0:
; AVX512-FCP-NEXT: subq $552, %rsp # imm = 0x228
; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [4,5,14,15,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX512-FCP-NEXT: vmovdqa 496(%rdi), %xmm1
; AVX512-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm1
; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
; AVX512-FCP-NEXT: vmovdqa 480(%rdi), %xmm2
; AVX512-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX512-FCP-NEXT: vmovdqa 512(%rdi), %ymm10
; AVX512-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-FCP-NEXT: vmovdqa 544(%rdi), %ymm11
; AVX512-FCP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-FCP-NEXT: vmovdqa 576(%rdi), %ymm7
; AVX512-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-FCP-NEXT: vmovdqa 608(%rdi), %ymm8
; AVX512-FCP-NEXT: vmovdqu %ymm8, (%rsp) # 32-byte Spill
; AVX512-FCP-NEXT: vmovdqa 352(%rdi), %ymm4
; AVX512-FCP-NEXT: vmovdqa 320(%rdi), %ymm5
; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13],ymm4[14],ymm5[15]
; AVX512-FCP-NEXT: vmovdqa64 %ymm5, %ymm28
; AVX512-FCP-NEXT: vmovdqa64
%ymm4, %ymm30 8395; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm4 8396; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0],xmm4[1,2,3],xmm1[4,5],xmm4[6,7] 8397; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u] 8398; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm4, %ymm4 8399; AVX512-FCP-NEXT: vmovdqa 384(%rdi), %ymm6 8400; AVX512-FCP-NEXT: vmovdqa 416(%rdi), %ymm9 8401; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm9[1,2],ymm6[3],ymm9[4],ymm6[5],ymm9[6,7],ymm6[8],ymm9[9,10],ymm6[11],ymm9[12],ymm6[13],ymm9[14,15] 8402; AVX512-FCP-NEXT: vmovdqa64 %ymm9, %ymm26 8403; AVX512-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8404; AVX512-FCP-NEXT: vmovdqa64 %ymm6, %ymm27 8405; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [1,3,0,2,4,6,1,3] 8406; AVX512-FCP-NEXT: vpermd %ymm5, %ymm6, %ymm5 8407; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,2,3,16,17,22,23,24,25,30,31,20,21,128,128,128,128,128,128] 8408; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm5, %ymm5 8409; AVX512-FCP-NEXT: vpor %ymm5, %ymm4, %ymm4 8410; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm8[0,1],ymm7[2],ymm8[3],ymm7[4],ymm8[5,6],ymm7[7],ymm8[8,9],ymm7[10],ymm8[11],ymm7[12],ymm8[13,14],ymm7[15] 8411; AVX512-FCP-NEXT: vextracti128 $1, %ymm5, %xmm7 8412; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm7[3,4],xmm5[5,6,7] 8413; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10,11],ymm10[12],ymm11[13],ymm10[14],ymm11[15] 8414; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [2,4,7,1,4,6,0,0] 8415; AVX512-FCP-NEXT: vpermd %ymm7, %ymm10, %ymm7 8416; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,u,u,u,u,u,8,9,14,15,0,1,6,7,16,17,22,23,20,21,22,23,24,25,30,31,16,17,22,23] 8417; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm7, %ymm7 8418; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm17 = [8,9,3,2,4,5,7,6] 8419; AVX512-FCP-NEXT: vpermt2d %ymm2, %ymm17, %ymm7 8420; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [12,13,14,15,4,5,14,15,8,9,2,3,12,13,6,7] 8421; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm5, %xmm5 8422; AVX512-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 8423; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3,4],ymm5[5,6,7] 8424; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm18 = [0,3,1,3,0,3,5,7] 8425; AVX512-FCP-NEXT: vmovdqa 448(%rdi), %ymm7 8426; AVX512-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8427; AVX512-FCP-NEXT: vpermd %ymm7, %ymm18, %ymm7 8428; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] 8429; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm7, %ymm7 8430; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] 8431; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm16 & (zmm7 ^ zmm4)) 8432; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm7, %zmm4 8433; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 8434; AVX512-FCP-NEXT: vmovdqa 176(%rdi), %xmm4 8435; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm4, %xmm0 8436; AVX512-FCP-NEXT: vmovdqa64 %xmm4, %xmm31 8437; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %xmm15 8438; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm15, %xmm3 8439; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] 8440; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm11 8441; 
AVX512-FCP-NEXT: vmovdqa 96(%rdi), %ymm7 8442; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm11[0],ymm7[1,2],ymm11[3],ymm7[4],ymm11[5],ymm7[6,7],ymm11[8],ymm7[9,10],ymm11[11],ymm7[12],ymm11[13],ymm7[14,15] 8443; AVX512-FCP-NEXT: vpermd %ymm3, %ymm6, %ymm3 8444; AVX512-FCP-NEXT: vpshufb %ymm13, %ymm3, %ymm3 8445; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm12 8446; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm6 8447; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm12[0],ymm6[1],ymm12[2,3],ymm6[4],ymm12[5],ymm6[6],ymm12[7,8],ymm6[9],ymm12[10,11],ymm6[12],ymm12[13],ymm6[14],ymm12[15] 8448; AVX512-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5 8449; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1,2,3],xmm4[4,5],xmm5[6,7] 8450; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm4, %ymm1 8451; AVX512-FCP-NEXT: vpor %ymm3, %ymm1, %ymm1 8452; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm8 8453; AVX512-FCP-NEXT: vmovdqa 224(%rdi), %ymm13 8454; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm13[0],ymm8[1],ymm13[2,3],ymm8[4],ymm13[5],ymm8[6],ymm13[7,8],ymm8[9],ymm13[10,11],ymm8[12],ymm13[13],ymm8[14],ymm13[15] 8455; AVX512-FCP-NEXT: vpermd %ymm3, %ymm10, %ymm3 8456; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm3, %ymm3 8457; AVX512-FCP-NEXT: vmovdqa 256(%rdi), %ymm5 8458; AVX512-FCP-NEXT: vmovdqa 288(%rdi), %ymm9 8459; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm9[0,1],ymm5[2],ymm9[3],ymm5[4],ymm9[5,6],ymm5[7],ymm9[8,9],ymm5[10],ymm9[11],ymm5[12],ymm9[13,14],ymm5[15] 8460; AVX512-FCP-NEXT: vmovdqa64 %ymm5, %ymm25 8461; AVX512-FCP-NEXT: vextracti128 $1, %ymm4, %xmm10 8462; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm10[3,4],xmm4[5,6,7] 8463; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm4, %xmm2 8464; AVX512-FCP-NEXT: vpermt2d %ymm0, %ymm17, %ymm3 8465; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 8466; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7] 8467; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %ymm23 8468; AVX512-FCP-NEXT: vpermd %ymm23, %ymm18, %ymm2 8469; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm2, %ymm2 8470; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm16 & (zmm2 ^ zmm1)) 8471; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 8472; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 8473; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm12[1],ymm6[2],ymm12[3],ymm6[4,5],ymm12[6],ymm6[7,8],ymm12[9],ymm6[10],ymm12[11],ymm6[12,13],ymm12[14],ymm6[15] 8474; AVX512-FCP-NEXT: vmovdqa64 %ymm12, %ymm21 8475; AVX512-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8476; AVX512-FCP-NEXT: vmovdqa64 %ymm6, %ymm29 8477; AVX512-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8478; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 8479; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6],xmm1[7] 8480; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm17 = [2,0,0,0,4,7,1,6] 8481; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0],ymm11[1],ymm7[2,3],ymm11[4],ymm7[5],ymm11[6],ymm7[7,8],ymm11[9],ymm7[10,11],ymm11[12],ymm7[13],ymm11[14],ymm7[15] 8482; AVX512-FCP-NEXT: vmovdqa64 %ymm11, %ymm19 8483; AVX512-FCP-NEXT: vmovdqa64 %ymm7, %ymm22 8484; AVX512-FCP-NEXT: vpermd %ymm1, %ymm17, %ymm1 8485; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,18,19,20,21,26,27,16,17,30,31,128,128,128,128,128,128] 8486; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm1, %ymm1 8487; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u] 8488; 
AVX512-FCP-NEXT: vpshufb %ymm2, %ymm0, %ymm0 8489; AVX512-FCP-NEXT: vpor %ymm1, %ymm0, %ymm10 8490; AVX512-FCP-NEXT: vpsrlq $48, %xmm31, %xmm0 8491; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm15, %xmm1 8492; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 8493; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm20 = [0,2,5,7,4,7,0,0] 8494; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm13[1],ymm8[2,3],ymm13[4],ymm8[5],ymm13[6],ymm8[7,8],ymm13[9],ymm8[10,11],ymm13[12],ymm8[13],ymm13[14],ymm8[15] 8495; AVX512-FCP-NEXT: vmovdqa %ymm8, %ymm5 8496; AVX512-FCP-NEXT: vpermd %ymm0, %ymm20, %ymm4 8497; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [2,3,4,5,4,5,0,1,6,7,8,9,14,15,4,5,18,19,20,21,20,21,16,17,22,23,24,25,30,31,20,21] 8498; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm4, %ymm4 8499; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3,4,5,6,7] 8500; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm4[4,5,6,7] 8501; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} ymm24 = [1,4,6,3,1,4,6,3] 8502; AVX512-FCP-NEXT: # ymm24 = mem[0,1,2,3,0,1,2,3] 8503; AVX512-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8504; AVX512-FCP-NEXT: vmovdqa64 %ymm25, %ymm12 8505; AVX512-FCP-NEXT: vmovdqu64 %ymm25, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8506; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm12[0],ymm9[1,2],ymm12[3],ymm9[4],ymm12[5],ymm9[6,7],ymm12[8],ymm9[9,10],ymm12[11],ymm9[12],ymm12[13],ymm9[14,15] 8507; AVX512-FCP-NEXT: vpermd %ymm1, %ymm24, %ymm0 8508; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,2,3,4,5,10,11,0,1,14,15,8,9,16,17,18,19,18,19,20,21,26,27,16,17,30,31,24,25] 8509; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm0 8510; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6,7] 8511; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm25 = [1,3,2,3,1,3,6,7] 8512; AVX512-FCP-NEXT: vpermd %ymm23, %ymm25, %ymm4 8513; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25] 8514; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm4, %ymm4 8515; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm16 & (zmm4 ^ zmm10)) 8516; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm10 8517; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm4, %zmm0 8518; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 8519; AVX512-FCP-NEXT: vmovdqa64 %ymm28, %ymm18 8520; AVX512-FCP-NEXT: vmovdqa64 %ymm30, %ymm28 8521; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm0 8522; AVX512-FCP-NEXT: vmovdqa64 %ymm30, %ymm4 8523; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2],ymm0[3],ymm4[4,5],ymm0[6],ymm4[7,8],ymm0[9],ymm4[10],ymm0[11],ymm4[12,13],ymm0[14],ymm4[15] 8524; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm4 8525; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5,6],xmm4[7] 8526; AVX512-FCP-NEXT: vmovdqa64 %ymm27, %ymm7 8527; AVX512-FCP-NEXT: vmovdqa64 %ymm26, %ymm4 8528; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm7[1],ymm4[2,3],ymm7[4],ymm4[5],ymm7[6],ymm4[7,8],ymm7[9],ymm4[10,11],ymm7[12],ymm4[13],ymm7[14],ymm4[15] 8529; AVX512-FCP-NEXT: vpermd %ymm4, %ymm17, %ymm4 8530; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm4, %ymm4 8531; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm0, %ymm0 8532; AVX512-FCP-NEXT: vpor %ymm4, %ymm0, %ymm0 8533; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload 8534; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Reload 8535; AVX512-FCP-NEXT: vmovdqa64 %ymm16, %ymm4 8536; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = 
ymm11[0],ymm4[1],ymm11[2,3],ymm4[4],ymm11[5],ymm4[6],ymm11[7,8],ymm4[9],ymm11[10,11],ymm4[12],ymm11[13],ymm4[14],ymm11[15] 8537; AVX512-FCP-NEXT: vpermd %ymm4, %ymm20, %ymm4 8538; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm4, %ymm3 8539; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 8540; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm6, %xmm2 8541; AVX512-FCP-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm27 # 16-byte Reload 8542; AVX512-FCP-NEXT: vpsrlq $48, %xmm27, %xmm4 8543; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] 8544; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4,5,6,7] 8545; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] 8546; AVX512-FCP-NEXT: vmovdqu (%rsp), %ymm8 # 32-byte Reload 8547; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 8548; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm8[1,2],ymm4[3],ymm8[4],ymm4[5],ymm8[6,7],ymm4[8],ymm8[9,10],ymm4[11],ymm8[12],ymm4[13],ymm8[14,15] 8549; AVX512-FCP-NEXT: vpermd %ymm3, %ymm24, %ymm3 8550; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm3, %ymm1 8551; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] 8552; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Reload 8553; AVX512-FCP-NEXT: vpermd %ymm17, %ymm25, %ymm2 8554; AVX512-FCP-NEXT: vpshufb %ymm14, %ymm2, %ymm2 8555; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm10 & (zmm2 ^ zmm0)) 8556; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm0 8557; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 8558; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm12[1],ymm9[2,3],ymm12[4],ymm9[5],ymm12[6],ymm9[7,8],ymm12[9],ymm9[10,11],ymm12[12],ymm9[13],ymm12[14],ymm9[15] 8559; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 8560; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7] 8561; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm15[u,u,u,u,u,u,u,u,4,5,14,15,u,u,u,u] 8562; AVX512-FCP-NEXT: vmovdqa64 %xmm15, %xmm20 8563; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm31[2],xmm0[3],xmm31[3] 8564; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm24 = [0,3,5,2,5,7,0,0] 8565; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm13[0],ymm5[1],ymm13[2],ymm5[3],ymm13[4,5],ymm5[6],ymm13[7,8],ymm5[9],ymm13[10],ymm5[11],ymm13[12,13],ymm5[14],ymm13[15] 8566; AVX512-FCP-NEXT: vmovdqa64 %ymm5, %ymm30 8567; AVX512-FCP-NEXT: vmovdqa64 %ymm13, %ymm26 8568; AVX512-FCP-NEXT: vpermd %ymm2, %ymm24, %ymm3 8569; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,6,7,2,3,2,3,4,5,10,11,0,1,14,15,16,17,22,23,18,19,18,19,20,21,26,27,16,17,30,31] 8570; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm3, %ymm3 8571; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3,4,5,6,7] 8572; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm3[4,5,6,7] 8573; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,8,9,2,3,12,13,6,7,0,1,10,11] 8574; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm1 8575; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 8576; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1,2,3,4],ymm1[5,6,7] 8577; AVX512-FCP-NEXT: vmovdqa64 %ymm21, %ymm0 8578; AVX512-FCP-NEXT: vmovdqa64 %ymm29, %ymm1 8579; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15] 8580; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm3 8581; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm1[0,1,2],xmm3[3,4],xmm1[5,6,7] 8582; AVX512-FCP-NEXT: vpmovsxbd 
{{.*#+}} ymm25 = [0,2,0,0,5,7,2,4] 8583; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm0 8584; AVX512-FCP-NEXT: vmovdqa64 %ymm22, %ymm1 8585; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] 8586; AVX512-FCP-NEXT: vmovdqa64 %ymm22, %ymm29 8587; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm21 8588; AVX512-FCP-NEXT: vpermd %ymm1, %ymm25, %ymm3 8589; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,6,7,8,9,14,15,4,5,6,7,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23] 8590; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm3, %ymm0 8591; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,14,15,8,9,2,3,12,13,6,7,12,13,14,15] 8592; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm5, %xmm5 8593; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2],ymm0[3,4,5,6,7] 8594; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,4,6,0,1,4,6,0] 8595; AVX512-FCP-NEXT: # ymm5 = mem[0,1,0,1] 8596; AVX512-FCP-NEXT: vpermd %ymm23, %ymm5, %ymm14 8597; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] 8598; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm14, %ymm14 8599; AVX512-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] 8600; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 ^ (zmm15 & (zmm14 ^ zmm0)) 8601; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm14, %zmm0 8602; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 8603; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm4[1],ymm8[2,3],ymm4[4],ymm8[5],ymm4[6],ymm8[7,8],ymm4[9],ymm8[10,11],ymm4[12],ymm8[13],ymm4[14],ymm8[15] 8604; AVX512-FCP-NEXT: vextracti128 $1, %ymm0, %xmm10 8605; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3,4],xmm10[5,6,7] 8606; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm0 8607; AVX512-FCP-NEXT: vmovdqa64 %ymm16, %ymm8 8608; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0],ymm11[1],ymm8[2],ymm11[3],ymm8[4,5],ymm11[6],ymm8[7,8],ymm11[9],ymm8[10],ymm11[11],ymm8[12,13],ymm11[14],ymm8[15] 8609; AVX512-FCP-NEXT: vmovdqa %ymm11, %ymm13 8610; AVX512-FCP-NEXT: vpermd %ymm2, %ymm24, %ymm2 8611; AVX512-FCP-NEXT: vpshufb %ymm12, %ymm2, %ymm2 8612; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm6[u,u,u,u,u,u,u,u,4,5,14,15,u,u,u,u] 8613; AVX512-FCP-NEXT: vmovdqa %xmm6, %xmm14 8614; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm4[2],xmm27[2],xmm4[3],xmm27[3] 8615; AVX512-FCP-NEXT: vmovdqa64 %xmm27, %xmm6 8616; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm2[3,4,5,6,7] 8617; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] 8618; AVX512-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 8619; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7] 8620; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm11 8621; AVX512-FCP-NEXT: vmovdqa64 %ymm28, %ymm12 8622; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm12[0,1],ymm11[2],ymm12[3],ymm11[4],ymm12[5,6],ymm11[7],ymm12[8,9],ymm11[10],ymm12[11],ymm11[12],ymm12[13,14],ymm11[15] 8623; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm4 8624; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[3,4],xmm2[5,6,7] 8625; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2 8626; AVX512-FCP-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm3 # 32-byte Folded Reload 8627; AVX512-FCP-NEXT: # ymm3 = 
ymm7[0],mem[1],ymm7[2,3],mem[4],ymm7[5],mem[6],ymm7[7,8],mem[9],ymm7[10,11],mem[12],ymm7[13],mem[14],ymm7[15] 8628; AVX512-FCP-NEXT: vmovdqa64 %ymm7, %ymm16 8629; AVX512-FCP-NEXT: vpermd %ymm3, %ymm25, %ymm3 8630; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm3, %ymm1 8631; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] 8632; AVX512-FCP-NEXT: vpermd %ymm17, %ymm5, %ymm2 8633; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm2 8634; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm15 & (zmm2 ^ zmm1)) 8635; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm28 8636; AVX512-FCP-NEXT: vmovdqa64 %xmm31, %xmm15 8637; AVX512-FCP-NEXT: vmovdqa64 %xmm20, %xmm7 8638; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm15[0],xmm7[1],xmm15[2,3] 8639; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm0 = [6,7,0,1,10,11,0,0,0,0,0,0,0,0,0,0] 8640; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm2 8641; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm18 = [1,3,6,0,5,0,0,0] 8642; AVX512-FCP-NEXT: vmovdqa64 %ymm30, %ymm9 8643; AVX512-FCP-NEXT: vmovdqa64 %ymm26, %ymm10 8644; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0,1],ymm9[2],ymm10[3],ymm9[4],ymm10[5,6],ymm9[7],ymm10[8,9],ymm9[10],ymm10[11],ymm9[12],ymm10[13,14],ymm9[15] 8645; AVX512-FCP-NEXT: vpermd %ymm1, %ymm18, %ymm4 8646; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [2,3,2,3,4,5,0,1,6,7,8,9,14,15,4,5,18,19,18,19,20,21,16,17,22,23,24,25,30,31,20,21] 8647; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm4, %ymm4 8648; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[3,4,5,6,7] 8649; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] 8650; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [2,4,7,0,2,4,7,0] 8651; AVX512-FCP-NEXT: # ymm4 = mem[0,1,0,1] 8652; AVX512-FCP-NEXT: vpermd %ymm23, %ymm4, %ymm5 8653; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25] 8654; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm5, %ymm5 8655; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm5, %zmm25 8656; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm6[0],xmm14[1],xmm6[2,3] 8657; AVX512-FCP-NEXT: vmovdqa64 %xmm27, %xmm22 8658; AVX512-FCP-NEXT: vmovdqa64 %xmm14, %xmm31 8659; AVX512-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm0 8660; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0,1],ymm13[2],ymm8[3],ymm13[4],ymm8[5,6],ymm13[7],ymm8[8,9],ymm13[10],ymm8[11],ymm13[12],ymm8[13,14],ymm13[15] 8661; AVX512-FCP-NEXT: vmovdqa64 %ymm13, %ymm20 8662; AVX512-FCP-NEXT: vmovdqa64 %ymm8, %ymm30 8663; AVX512-FCP-NEXT: vpermd %ymm2, %ymm18, %ymm2 8664; AVX512-FCP-NEXT: vpshufb %ymm1, %ymm2, %ymm1 8665; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3,4,5,6,7] 8666; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 8667; AVX512-FCP-NEXT: vpermd %ymm17, %ymm4, %ymm1 8668; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm1, %ymm1 8669; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm24 8670; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm15[0,1],xmm7[2],xmm15[3] 8671; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm9[0],ymm10[1,2],ymm9[3],ymm10[4],ymm9[5],ymm10[6,7],ymm9[8],ymm10[9,10],ymm9[11],ymm10[12],ymm9[13],ymm10[14,15] 8672; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm2 = [8,9,2,3,12,13,0,0,0,0,0,0,0,0,0,0] 8673; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm3 8674; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm27 = [1,4,6,3,6,0,0,0] 8675; AVX512-FCP-NEXT: vpermd %ymm1, %ymm27, %ymm1 8676; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,2,3,4,5,2,3,4,5,10,11,0,1,14,15,16,17,18,19,20,21,18,19,20,21,26,27,16,17,30,31] 8677; AVX512-FCP-NEXT: 
vpshufb %ymm0, %ymm1, %ymm1 8678; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm1[3,4,5,6,7] 8679; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] 8680; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm26 = [0,2,1,3,0,2,5,7] 8681; AVX512-FCP-NEXT: vpermd %ymm23, %ymm26, %ymm3 8682; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,6,7,4,5,6,7,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] 8683; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3 8684; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm19 8685; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 8686; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 8687; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload 8688; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm14[1,2],ymm4[3],ymm14[4],ymm4[5],ymm14[6,7],ymm4[8],ymm14[9,10],ymm4[11],ymm14[12],ymm4[13],ymm14[14,15] 8689; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm18 8690; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4 8691; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3] 8692; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,3,0,0,5,0,2,7] 8693; AVX512-FCP-NEXT: vmovdqa64 %ymm21, %ymm15 8694; AVX512-FCP-NEXT: vmovdqa64 %ymm29, %ymm10 8695; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm10[0],ymm15[1],ymm10[2],ymm15[3],ymm10[4,5],ymm15[6],ymm10[7,8],ymm15[9],ymm10[10],ymm15[11],ymm10[12,13],ymm15[14],ymm10[15] 8696; AVX512-FCP-NEXT: vpermd %ymm5, %ymm4, %ymm5 8697; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [2,3,4,5,10,11,0,1,14,15,14,15,2,3,4,5,18,19,20,21,26,27,16,17,30,31,30,31,18,19,20,21] 8698; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm5 8699; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [6,7,0,1,10,11,4,5,14,15,8,9,12,13,14,15] 8700; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm3, %xmm3 8701; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3,4,5,6,7] 8702; AVX512-FCP-NEXT: vpmovsxdq {{.*#+}} zmm5 = [18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0,18446744073709486080,18446744073709551615] 8703; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm25 = zmm25 ^ (zmm5 & (zmm25 ^ zmm3)) 8704; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 8705; AVX512-FCP-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload 8706; AVX512-FCP-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3],mem[4],ymm3[5],mem[6],ymm3[7,8],mem[9],ymm3[10,11],mem[12],ymm3[13],mem[14],ymm3[15] 8707; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm8 8708; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm3[0],xmm8[1,2,3],xmm3[4,5],xmm8[6,7] 8709; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,0,1,10,11,4,5,14,15,8,9,2,3,12,13] 8710; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm8, %xmm8 8711; AVX512-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 8712; AVX512-FCP-NEXT: vextracti64x4 $1, %zmm25, %ymm9 8713; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1,2,3,4,5,6,7],ymm9[8],ymm8[9,10,11,12,13,14,15] 8714; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] 8715; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm25, %zmm21 8716; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm11[0],ymm12[1,2],ymm11[3],ymm12[4],ymm11[5],ymm12[6,7],ymm11[8],ymm12[9,10],ymm11[11],ymm12[12],ymm11[13],ymm12[14,15] 8717; AVX512-FCP-NEXT: vmovdqa64 %ymm12, %ymm23 8718; AVX512-FCP-NEXT: vmovdqa64 %ymm11, %ymm25 8719; AVX512-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9 8720; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2],xmm8[3] 8721; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm8, %xmm7 8722; 
AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload 8723; AVX512-FCP-NEXT: vmovdqa64 %ymm16, %ymm12 8724; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm13[0],ymm12[1],ymm13[2],ymm12[3],ymm13[4,5],ymm12[6],ymm13[7,8],ymm12[9],ymm13[10],ymm12[11],ymm13[12,13],ymm12[14],ymm13[15] 8725; AVX512-FCP-NEXT: vpermd %ymm8, %ymm4, %ymm4 8726; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm4, %ymm4 8727; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3,4,5,6,7] 8728; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm24 = zmm24 ^ (zmm5 & (zmm24 ^ zmm4)) 8729; AVX512-FCP-NEXT: vmovdqa64 %xmm22, %xmm4 8730; AVX512-FCP-NEXT: vmovdqa64 %xmm31, %xmm5 8731; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3] 8732; AVX512-FCP-NEXT: vpshufb %xmm2, %xmm4, %xmm2 8733; AVX512-FCP-NEXT: vmovdqa64 %ymm20, %ymm4 8734; AVX512-FCP-NEXT: vmovdqa64 %ymm30, %ymm5 8735; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm5[1,2],ymm4[3],ymm5[4],ymm4[5],ymm5[6,7],ymm4[8],ymm5[9,10],ymm4[11],ymm5[12],ymm4[13],ymm5[14,15] 8736; AVX512-FCP-NEXT: vpermd %ymm4, %ymm27, %ymm4 8737; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm4, %ymm0 8738; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm0[3,4,5,6,7] 8739; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] 8740; AVX512-FCP-NEXT: vpermd %ymm17, %ymm26, %ymm2 8741; AVX512-FCP-NEXT: vmovdqa64 %ymm19, %ymm4 8742; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm2 8743; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 8744; AVX512-FCP-NEXT: vmovdqu (%rsp), %ymm9 # 32-byte Reload 8745; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload 8746; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm11[0],ymm9[1],ymm11[2,3],ymm9[4],ymm11[5],ymm9[6],ymm11[7,8],ymm9[9],ymm11[10,11],ymm9[12],ymm11[13],ymm9[14],ymm11[15] 8747; AVX512-FCP-NEXT: vextracti128 $1, %ymm2, %xmm4 8748; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4,5],xmm4[6,7] 8749; AVX512-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2 8750; AVX512-FCP-NEXT: vextracti64x4 $1, %zmm24, %ymm3 8751; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 8752; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3,4,5,6,7],ymm3[8],ymm2[9,10,11,12,13,14,15] 8753; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] 8754; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm24, %zmm2 8755; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm10[0,1],ymm15[2],ymm10[3],ymm15[4],ymm10[5,6],ymm15[7],ymm10[8,9],ymm15[10],ymm10[11],ymm15[12],ymm10[13,14],ymm15[15] 8756; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,3,0,0,6,0,3,5] 8757; AVX512-FCP-NEXT: vpermd %ymm3, %ymm4, %ymm3 8758; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,6,7,8,9,14,15,0,1,6,7,0,1,6,7,16,17,22,23,24,25,30,31,16,17,22,23,16,17,22,23] 8759; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm3, %ymm3 8760; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm6 8761; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm14[0],ymm6[1],ymm14[2,3],ymm6[4],ymm14[5],ymm6[6],ymm14[7,8],ymm6[9],ymm14[10,11],ymm6[12],ymm14[13],ymm6[14],ymm14[15] 8762; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7 8763; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3,4],xmm7[5,6,7] 8764; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7] 8765; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm6, %xmm6 8766; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2],ymm3[3,4,5,6,7] 8767; AVX512-FCP-NEXT: movb $7, %al 8768; AVX512-FCP-NEXT: kmovw %eax, %k1 8769; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm1 {%k1} 8770; 
AVX512-FCP-NEXT: vextracti64x4 $1, %zmm1, %ymm3 8771; AVX512-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 8772; AVX512-FCP-NEXT: vpblendw $181, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload 8773; AVX512-FCP-NEXT: # ymm6 = mem[0],ymm6[1],mem[2],ymm6[3],mem[4,5],ymm6[6],mem[7,8],ymm6[9],mem[10],ymm6[11],mem[12,13],ymm6[14],mem[15] 8774; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm8 8775; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm8[2,3],xmm6[4,5,6],xmm8[7] 8776; AVX512-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [6,7,2,3,12,13,6,7,0,1,10,11,4,5,14,15] 8777; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm6, %xmm6 8778; AVX512-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 8779; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm3[0],ymm6[1,2,3,4,5,6,7],ymm3[8],ymm6[9,10,11,12,13,14,15] 8780; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] 8781; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 8782; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm3 8783; AVX512-FCP-NEXT: vmovdqa64 %ymm25, %ymm6 8784; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm6[1],ymm3[2,3],ymm6[4],ymm3[5],ymm6[6],ymm3[7,8],ymm6[9],ymm3[10,11],ymm6[12],ymm3[13],ymm6[14],ymm3[15] 8785; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm6 8786; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1,2],xmm3[3,4],xmm6[5,6,7] 8787; AVX512-FCP-NEXT: vpshufb %xmm7, %xmm3, %xmm3 8788; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm13[0,1],ymm12[2],ymm13[3],ymm12[4],ymm13[5,6],ymm12[7],ymm13[8,9],ymm12[10],ymm13[11],ymm12[12],ymm13[13,14],ymm12[15] 8789; AVX512-FCP-NEXT: vpermd %ymm6, %ymm4, %ymm4 8790; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm4, %ymm4 8791; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4,5,6,7] 8792; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm0 {%k1} 8793; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm9[0],ymm11[1],ymm9[2],ymm11[3],ymm9[4,5],ymm11[6],ymm9[7,8],ymm11[9],ymm9[10],ymm11[11],ymm9[12,13],ymm11[14],ymm9[15] 8794; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4 8795; AVX512-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5,6],xmm4[7] 8796; AVX512-FCP-NEXT: vpshufb %xmm8, %xmm3, %xmm3 8797; AVX512-FCP-NEXT: vextracti64x4 $1, %zmm0, %ymm4 8798; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 8799; AVX512-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1,2,3,4,5,6,7],ymm4[8],ymm3[9,10,11,12,13,14,15] 8800; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] 8801; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 8802; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 8803; AVX512-FCP-NEXT: vmovaps %zmm3, (%rsi) 8804; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 8805; AVX512-FCP-NEXT: vmovaps %zmm3, 64(%rsi) 8806; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 8807; AVX512-FCP-NEXT: vmovaps %zmm3, 64(%rdx) 8808; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 8809; AVX512-FCP-NEXT: vmovaps %zmm3, (%rdx) 8810; AVX512-FCP-NEXT: vmovdqa64 %zmm28, 64(%rcx) 8811; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload 8812; AVX512-FCP-NEXT: vmovaps %zmm3, (%rcx) 8813; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 64(%r8) 8814; AVX512-FCP-NEXT: vmovdqa64 %zmm21, (%r8) 8815; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 64(%r9) 8816; AVX512-FCP-NEXT: vmovdqa64 %zmm1, (%r9) 8817; AVX512-FCP-NEXT: addq $552, %rsp # imm = 0x228 8818; AVX512-FCP-NEXT: vzeroupper 8819; AVX512-FCP-NEXT: retq 8820; 8821; AVX512DQ-LABEL: 
load_i16_stride5_vf64: 8822; AVX512DQ: # %bb.0: 8823; AVX512DQ-NEXT: subq $552, %rsp # imm = 0x228 8824; AVX512DQ-NEXT: vmovdqa 384(%rdi), %ymm6 8825; AVX512DQ-NEXT: vmovdqa 416(%rdi), %ymm11 8826; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm11[1,2],ymm6[3],ymm11[4],ymm6[5],ymm11[6,7],ymm6[8],ymm11[9,10],ymm6[11],ymm11[12],ymm6[13],ymm11[14,15] 8827; AVX512DQ-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8828; AVX512DQ-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8829; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] 8830; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6],ymm1[7] 8831; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,6,7,16,17,26,27,20,21,30,31,24,25,128,128,128,128,128,128] 8832; AVX512DQ-NEXT: vpshufb %ymm0, %ymm1, %ymm2 8833; AVX512DQ-NEXT: vmovdqa 352(%rdi), %ymm8 8834; AVX512DQ-NEXT: vmovdqa 320(%rdi), %ymm7 8835; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0],ymm8[1],ymm7[2,3],ymm8[4],ymm7[5],ymm8[6],ymm7[7,8],ymm8[9],ymm7[10,11],ymm8[12],ymm7[13],ymm8[14],ymm7[15] 8836; AVX512DQ-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8837; AVX512DQ-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8838; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm3 8839; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0],xmm3[1,2,3],xmm1[4,5],xmm3[6,7] 8840; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u] 8841; AVX512DQ-NEXT: vpshufb %ymm1, %ymm3, %ymm3 8842; AVX512DQ-NEXT: vporq %ymm2, %ymm3, %ymm19 8843; AVX512DQ-NEXT: vmovdqa 192(%rdi), %ymm15 8844; AVX512DQ-NEXT: vmovdqa 224(%rdi), %ymm13 8845; AVX512DQ-NEXT: vmovdqa 176(%rdi), %xmm12 8846; AVX512DQ-NEXT: vmovdqa 160(%rdi), %xmm14 8847; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm4 8848; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm5 8849; AVX512DQ-NEXT: vmovdqa 64(%rdi), %ymm10 8850; AVX512DQ-NEXT: vmovdqa 96(%rdi), %ymm9 8851; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm10[0],ymm9[1,2],ymm10[3],ymm9[4],ymm10[5],ymm9[6,7],ymm10[8],ymm9[9,10],ymm10[11],ymm9[12],ymm10[13],ymm9[14,15] 8852; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] 8853; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6],ymm3[7] 8854; AVX512DQ-NEXT: vpshufb %ymm0, %ymm2, %ymm0 8855; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10,11],ymm5[12],ymm4[13],ymm5[14],ymm4[15] 8856; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3 8857; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4,5],xmm3[6,7] 8858; AVX512DQ-NEXT: vpshufb %ymm1, %ymm2, %ymm1 8859; AVX512DQ-NEXT: vpor %ymm0, %ymm1, %ymm0 8860; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 8861; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm11[0],ymm6[1],ymm11[2,3],ymm6[4],ymm11[5],ymm6[6],ymm11[7,8],ymm6[9],ymm11[10,11],ymm6[12],ymm11[13],ymm6[14],ymm11[15] 8862; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1] 8863; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7] 8864; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,8,9,18,19,28,29,22,23,16,17,26,27,128,128,128,128,128,128] 8865; AVX512DQ-NEXT: vpshufb %ymm0, %ymm1, %ymm2 8866; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0],ymm7[1],ymm8[2],ymm7[3],ymm8[4,5],ymm7[6],ymm8[7,8],ymm7[9],ymm8[10],ymm7[11],ymm8[12,13],ymm7[14],ymm8[15] 8867; AVX512DQ-NEXT: vextracti128 
$1, %ymm1, %xmm3 8868; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1],xmm3[2,3],xmm1[4,5,6],xmm3[7] 8869; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u] 8870; AVX512DQ-NEXT: vpshufb %ymm1, %ymm3, %ymm3 8871; AVX512DQ-NEXT: vpor %ymm2, %ymm3, %ymm2 8872; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 8873; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm9[0],ymm10[1],ymm9[2,3],ymm10[4],ymm9[5],ymm10[6],ymm9[7,8],ymm10[9],ymm9[10,11],ymm10[12],ymm9[13],ymm10[14],ymm9[15] 8874; AVX512DQ-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8875; AVX512DQ-NEXT: vmovdqu %ymm9, (%rsp) # 32-byte Spill 8876; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] 8877; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7] 8878; AVX512DQ-NEXT: vpshufb %ymm0, %ymm2, %ymm0 8879; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15] 8880; AVX512DQ-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8881; AVX512DQ-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 8882; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3 8883; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5,6],xmm3[7] 8884; AVX512DQ-NEXT: vpshufb %ymm1, %ymm2, %ymm1 8885; AVX512DQ-NEXT: vporq %ymm0, %ymm1, %ymm28 8886; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm6 = [6,7,0,1,10,11,4,5,14,15,14,15,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29] 8887; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm13[0,1],ymm15[2],ymm13[3],ymm15[4],ymm13[5,6],ymm15[7],ymm13[8,9],ymm15[10],ymm13[11],ymm15[12],ymm13[13,14],ymm15[15] 8888; AVX512DQ-NEXT: vmovdqa64 %ymm15, %ymm18 8889; AVX512DQ-NEXT: vmovdqa64 %ymm13, %ymm24 8890; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1 8891; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6,7] 8892; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[6,7,6,7,8,9,4,5,14,15,8,9,2,3,12,13,22,23,22,23,24,25,20,21,30,31,24,25,18,19,28,29] 8893; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm12[0],xmm14[1],xmm12[2,3] 8894; AVX512DQ-NEXT: vmovdqa64 %xmm12, %xmm16 8895; AVX512DQ-NEXT: vmovdqa64 %xmm14, %xmm30 8896; AVX512DQ-NEXT: vpshufb %xmm6, %xmm1, %xmm1 8897; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4,5,6,7] 8898; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] 8899; AVX512DQ-NEXT: vmovdqa 144(%rdi), %xmm11 8900; AVX512DQ-NEXT: vmovdqa 128(%rdi), %xmm7 8901; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm11[0,1],xmm7[2],xmm11[3] 8902; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13] 8903; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 8904; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 8905; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm1 = ymm9[0],ymm10[1],ymm9[2],ymm10[3],ymm9[4,5],ymm10[6],ymm9[7,8],ymm10[9],ymm9[10],ymm10[11],ymm9[12,13],ymm10[14],ymm9[15] 8906; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,3,0,1] 8907; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7] 8908; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm5[1,2],ymm4[3],ymm5[4],ymm4[5],ymm5[6,7],ymm4[8],ymm5[9,10],ymm4[11],ymm5[12],ymm4[13],ymm5[14,15] 8909; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3 8910; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2],xmm2[3] 8911; AVX512DQ-NEXT: vpshufb %ymm6, %ymm1, %ymm1 8912; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = 
xmm2[6,7,0,1,10,11,4,5,14,15,8,9,12,13,14,15] 8913; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] 8914; AVX512DQ-NEXT: vmovdqa 256(%rdi), %ymm12 8915; AVX512DQ-NEXT: vmovdqa 288(%rdi), %ymm15 8916; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm12[0],ymm15[1],ymm12[2,3],ymm15[4],ymm12[5],ymm15[6],ymm12[7,8],ymm15[9],ymm12[10,11],ymm15[12],ymm12[13],ymm15[14],ymm12[15] 8917; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3 8918; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4,5],xmm3[6,7] 8919; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm0 ^ (mem & (zmm1 ^ zmm0)) 8920; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm2[0,1,0,1,10,11,4,5,14,15,8,9,2,3,12,13] 8921; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 8922; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm2 8923; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm0[1,2,3,4,5,6,7],ymm2[8],ymm0[9,10,11,12,13,14,15] 8924; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] 8925; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 8926; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 8927; AVX512DQ-NEXT: vmovdqa 464(%rdi), %xmm8 8928; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[3,1,2,3] 8929; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7] 8930; AVX512DQ-NEXT: vmovdqa 448(%rdi), %xmm3 8931; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[0,2,2,3] 8932; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] 8933; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 8934; AVX512DQ-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8935; AVX512DQ-NEXT: vmovdqa %xmm11, %xmm6 8936; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm11[3,1,2,3] 8937; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7] 8938; AVX512DQ-NEXT: vmovdqa %xmm7, %xmm9 8939; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[0,2,2,3] 8940; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] 8941; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 8942; AVX512DQ-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 8943; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0],xmm8[1],xmm3[2,3] 8944; AVX512DQ-NEXT: vmovdqa64 %xmm3, %xmm22 8945; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [6,7,2,3,4,5,6,7,6,7,2,3,12,13,6,7] 8946; AVX512DQ-NEXT: vpshufb %xmm1, %xmm0, %xmm0 8947; AVX512DQ-NEXT: vmovdqa64 %xmm1, %xmm23 8948; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm10 8949; AVX512DQ-NEXT: vmovdqa 576(%rdi), %ymm1 8950; AVX512DQ-NEXT: vmovdqa 608(%rdi), %ymm2 8951; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1],ymm1[2],ymm2[3],ymm1[4],ymm2[5,6],ymm1[7],ymm2[8,9],ymm1[10],ymm2[11],ymm1[12],ymm2[13,14],ymm1[15] 8952; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm20 8953; AVX512DQ-NEXT: vmovdqa64 %ymm1, %ymm17 8954; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1 8955; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm11 = xmm0[0,1,2],xmm1[3,4],xmm0[5,6,7] 8956; AVX512DQ-NEXT: vmovdqa 512(%rdi), %ymm5 8957; AVX512DQ-NEXT: vmovdqa 544(%rdi), %ymm13 8958; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm13[0],ymm5[1],ymm13[2,3],ymm5[4],ymm13[5],ymm5[6],ymm13[7,8],ymm5[9],ymm13[10,11],ymm5[12],ymm13[13],ymm5[14],ymm13[15] 8959; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm1 8960; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5,6,7] 8961; AVX512DQ-NEXT: vmovdqa64 496(%rdi), %xmm21 8962; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm21[3,1,2,3] 8963; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[2,1,2,3,4,5,6,7] 8964; AVX512DQ-NEXT: vmovdqa 
480(%rdi), %xmm7 8965; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm14 = xmm7[0,2,2,3] 8966; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,3,2,3,4,5,6,7] 8967; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] 8968; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,u,u,u,u,8,9,2,3,12,13,6,7,16,17,26,27,26,27,30,31,24,25,18,19,28,29,22,23] 8969; AVX512DQ-NEXT: vpshufb %ymm14, %ymm2, %ymm2 8970; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7] 8971; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm2 = [12,13,14,15,4,5,14,15,8,9,2,3,12,13,6,7] 8972; AVX512DQ-NEXT: vpshufb %xmm2, %xmm11, %xmm11 8973; AVX512DQ-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 8974; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm11[5,6,7] 8975; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm27 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] 8976; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm27 & (zmm10 ^ zmm19)) 8977; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm10, %zmm0 8978; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 8979; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm15[0,1],ymm12[2],ymm15[3],ymm12[4],ymm15[5,6],ymm12[7],ymm15[8,9],ymm12[10],ymm15[11],ymm12[12],ymm15[13,14],ymm12[15] 8980; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm10 8981; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm10[3,4],xmm0[5,6,7] 8982; AVX512DQ-NEXT: vpshufb %xmm2, %xmm0, %xmm0 8983; AVX512DQ-NEXT: vmovdqa64 %ymm18, %ymm3 8984; AVX512DQ-NEXT: vmovdqa64 %ymm24, %ymm1 8985; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5],ymm3[6],ymm1[7,8],ymm3[9],ymm1[10,11],ymm3[12],ymm1[13],ymm3[14],ymm1[15] 8986; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm10 8987; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0],ymm2[1,2],ymm10[3],ymm2[4,5,6,7] 8988; AVX512DQ-NEXT: vpshufb %ymm14, %ymm2, %ymm2 8989; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm10 = xmm16[3,1,2,3] 8990; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[2,1,2,3,4,5,6,7] 8991; AVX512DQ-NEXT: vmovdqa64 %xmm30, %xmm24 8992; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm11 = xmm30[0,2,2,3] 8993; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,3,2,3,4,5,6,7] 8994; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] 8995; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1],ymm2[2,3,4,5,6,7] 8996; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 8997; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7] 8998; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm9[0],xmm6[1],xmm9[2,3] 8999; AVX512DQ-NEXT: vmovdqa64 %xmm9, %xmm25 9000; AVX512DQ-NEXT: vmovdqa64 %xmm23, %xmm4 9001; AVX512DQ-NEXT: vpshufb %xmm4, %xmm2, %xmm2 9002; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 9003; AVX512DQ-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm2 # 64-byte Folded Reload 9004; AVX512DQ-NEXT: # zmm2 = zmm2 ^ (zmm27 & (zmm2 ^ mem)) 9005; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 9006; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 9007; AVX512DQ-NEXT: vmovdqa64 %xmm22, %xmm4 9008; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0,1],xmm8[2],xmm4[3] 9009; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7,8,9,4,5,14,15,8,9] 9010; AVX512DQ-NEXT: vpshufb %xmm0, %xmm2, %xmm2 9011; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 9012; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm10 = 
ymm5[0],ymm13[1],ymm5[2,3],ymm13[4],ymm5[5],ymm13[6],ymm5[7,8],ymm13[9],ymm5[10,11],ymm13[12],ymm5[13],ymm13[14],ymm5[15] 9013; AVX512DQ-NEXT: vmovdqa64 %ymm13, %ymm26 9014; AVX512DQ-NEXT: vmovdqa64 %ymm5, %ymm22 9015; AVX512DQ-NEXT: vextracti128 $1, %ymm10, %xmm11 9016; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0],ymm11[1],ymm10[2],ymm11[3],ymm10[4,5,6,7] 9017; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm11 = xmm7[0,3,2,3] 9018; AVX512DQ-NEXT: vmovdqa64 %xmm7, %xmm30 9019; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[1,2,2,3,4,5,6,7] 9020; AVX512DQ-NEXT: vpsrlq $48, %xmm21, %xmm13 9021; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm11[0],xmm13[0],xmm11[1],xmm13[1] 9022; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm13 = [2,3,12,13,0,1,0,1,10,11,4,5,14,15,8,9,18,19,28,29,16,17,16,17,26,27,20,21,30,31,24,25] 9023; AVX512DQ-NEXT: vpshufb %ymm13, %ymm10, %ymm10 9024; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm10[3,4,5,6,7] 9025; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] 9026; AVX512DQ-NEXT: vmovdqa64 %ymm20, %ymm9 9027; AVX512DQ-NEXT: vmovdqa64 %ymm17, %ymm7 9028; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm11 = ymm7[0],ymm9[1,2],ymm7[3],ymm9[4],ymm7[5],ymm9[6,7],ymm7[8],ymm9[9,10],ymm7[11],ymm9[12],ymm7[13],ymm9[14,15] 9029; AVX512DQ-NEXT: vextracti128 $1, %ymm11, %xmm14 9030; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm11 = xmm14[0],xmm11[1],xmm14[2],xmm11[3] 9031; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,2,3,6,7,0,1,10,11,4,5,14,15,8,9] 9032; AVX512DQ-NEXT: vpshufb %xmm14, %xmm11, %xmm11 9033; AVX512DQ-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 9034; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm11[5,6,7] 9035; AVX512DQ-NEXT: vpternlogq $184, {{[-0-9]+}}(%r{{[sb]}}p), %zmm27, %zmm2 # 64-byte Folded Reload 9036; AVX512DQ-NEXT: # zmm2 = zmm2 ^ (zmm27 & (zmm2 ^ mem)) 9037; AVX512DQ-NEXT: vinserti64x4 $1, %ymm10, %zmm2, %zmm2 9038; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 9039; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5],ymm1[6],ymm3[7,8],ymm1[9],ymm3[10,11],ymm1[12],ymm3[13],ymm1[14],ymm3[15] 9040; AVX512DQ-NEXT: vmovdqa64 %ymm1, %ymm17 9041; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm10 9042; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm10[1],ymm2[2],ymm10[3],ymm2[4,5,6,7] 9043; AVX512DQ-NEXT: vpshufb %ymm13, %ymm2, %ymm2 9044; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm10 = xmm24[0,3,2,3] 9045; AVX512DQ-NEXT: vmovdqa64 %xmm24, %xmm20 9046; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[1,2,2,3,4,5,6,7] 9047; AVX512DQ-NEXT: vmovdqa64 %xmm16, %xmm1 9048; AVX512DQ-NEXT: vpsrlq $48, %xmm16, %xmm11 9049; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1] 9050; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm2[3,4,5,6,7] 9051; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm2[4,5,6,7] 9052; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm10 = ymm12[0],ymm15[1,2],ymm12[3],ymm15[4],ymm12[5],ymm15[6,7],ymm12[8],ymm15[9,10],ymm12[11],ymm15[12],ymm12[13],ymm15[14,15] 9053; AVX512DQ-NEXT: vmovdqa64 %ymm12, %ymm19 9054; AVX512DQ-NEXT: vextracti128 $1, %ymm10, %xmm11 9055; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2],xmm10[3] 9056; AVX512DQ-NEXT: vpshufb %xmm14, %xmm10, %xmm10 9057; AVX512DQ-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 9058; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm10[5,6,7] 9059; AVX512DQ-NEXT: vmovdqa64 %xmm25, %xmm5 9060; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm10 = xmm5[0,1],xmm6[2],xmm5[3] 9061; AVX512DQ-NEXT: 
vmovdqa64 %xmm6, %xmm23 9062; AVX512DQ-NEXT: vpshufb %xmm0, %xmm10, %xmm0 9063; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 9064; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm27 & (zmm0 ^ zmm28)) 9065; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 9066; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 9067; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload 9068; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm18 # 32-byte Reload 9069; AVX512DQ-NEXT: vmovdqa64 %ymm18, %ymm0 9070; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2,3],ymm12[4],ymm0[5],ymm12[6],ymm0[7,8],ymm12[9],ymm0[10,11],ymm12[12],ymm0[13],ymm12[14],ymm0[15] 9071; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] 9072; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm2[6],ymm0[7] 9073; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm24 # 32-byte Reload 9074; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Reload 9075; AVX512DQ-NEXT: vmovdqa64 %ymm24, %ymm0 9076; AVX512DQ-NEXT: vmovdqa64 %ymm16, %ymm5 9077; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm5[2],ymm0[3],ymm5[4],ymm0[5,6],ymm5[7],ymm0[8,9],ymm5[10],ymm0[11],ymm5[12],ymm0[13,14],ymm5[15] 9078; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm10 9079; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm10 = xmm0[0,1,2],xmm10[3,4],xmm0[5,6,7] 9080; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm0 = [4,5,14,15,8,9,2,3,12,13,10,11,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27] 9081; AVX512DQ-NEXT: vpshufb %ymm0, %ymm2, %ymm2 9082; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[4,5,14,15,8,9,2,3,12,13,6,7,12,13,14,15] 9083; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2],ymm2[3,4,5,6,7] 9084; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm13 = xmm8[0],xmm4[1],xmm8[2,3] 9085; AVX512DQ-NEXT: vmovdqa64 %xmm4, %xmm29 9086; AVX512DQ-NEXT: vmovdqa64 %xmm8, %xmm28 9087; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11] 9088; AVX512DQ-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 9089; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm13 = zmm13 ^ (zmm27 & (zmm13 ^ zmm2)) 9090; AVX512DQ-NEXT: vmovdqa64 %ymm26, %ymm8 9091; AVX512DQ-NEXT: vmovdqa64 %ymm22, %ymm4 9092; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0],ymm4[1],ymm8[2],ymm4[3],ymm8[4,5],ymm4[6],ymm8[7,8],ymm4[9],ymm8[10],ymm4[11],ymm8[12,13],ymm4[14],ymm8[15] 9093; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm14 9094; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm14[1],ymm2[2,3,4,5,6,7] 9095; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm14 = [4,5,14,15,12,13,2,3,12,13,6,7,0,1,10,11,20,21,30,31,28,29,18,19,28,29,22,23,16,17,26,27] 9096; AVX512DQ-NEXT: vpshufb %ymm14, %ymm2, %ymm2 9097; AVX512DQ-NEXT: vmovdqa64 %xmm30, %xmm6 9098; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm11 = xmm30[0,1,1,3] 9099; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,7,6,7] 9100; AVX512DQ-NEXT: vmovdqa64 %xmm21, %xmm5 9101; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm11 = xmm11[2],xmm21[2],xmm11[3],xmm21[3] 9102; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm2[3,4,5,6,7] 9103; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1,2,3],ymm2[4,5,6,7] 9104; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm11 = ymm9[0],ymm7[1],ymm9[2,3],ymm7[4],ymm9[5],ymm7[6],ymm9[7,8],ymm7[9],ymm9[10,11],ymm7[12],ymm9[13],ymm7[14],ymm9[15] 9105; AVX512DQ-NEXT: vmovdqa64 %ymm7, %ymm21 9106; AVX512DQ-NEXT: vextracti128 $1, %ymm11, %xmm10 9107; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm11[3,4],xmm10[5,6,7] 9108; AVX512DQ-NEXT: vmovdqa {{.*#+}} 
xmm11 = [0,1,2,3,8,9,2,3,12,13,6,7,0,1,10,11] 9109; AVX512DQ-NEXT: vpshufb %xmm11, %xmm10, %xmm10 9110; AVX512DQ-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 9111; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm10[5,6,7] 9112; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm13, %zmm2 9113; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 9114; AVX512DQ-NEXT: vmovdqa64 %ymm17, %ymm2 9115; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10],ymm3[11],ymm2[12,13],ymm3[14],ymm2[15] 9116; AVX512DQ-NEXT: vmovdqa64 %ymm3, %ymm22 9117; AVX512DQ-NEXT: vmovdqa64 %ymm17, %ymm26 9118; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm10 9119; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm10[1],ymm2[2,3,4,5,6,7] 9120; AVX512DQ-NEXT: vpshufb %ymm14, %ymm2, %ymm2 9121; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm10 = xmm20[0,1,1,3] 9122; AVX512DQ-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,7,6,7] 9123; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm10 = xmm10[2],xmm1[2],xmm10[3],xmm1[3] 9124; AVX512DQ-NEXT: vmovdqa64 %xmm1, %xmm17 9125; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm2[3,4,5,6,7] 9126; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm10[0,1,2,3],ymm2[4,5,6,7] 9127; AVX512DQ-NEXT: vmovdqa64 %ymm19, %ymm1 9128; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm10 = ymm15[0],ymm1[1],ymm15[2,3],ymm1[4],ymm15[5],ymm1[6],ymm15[7,8],ymm1[9],ymm15[10,11],ymm1[12],ymm15[13],ymm1[14],ymm15[15] 9129; AVX512DQ-NEXT: vmovdqa64 %ymm15, %ymm31 9130; AVX512DQ-NEXT: vextracti128 $1, %ymm10, %xmm13 9131; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm10 = xmm13[0,1,2],xmm10[3,4],xmm13[5,6,7] 9132; AVX512DQ-NEXT: vpshufb %xmm11, %xmm10, %xmm10 9133; AVX512DQ-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 9134; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm10[5,6,7] 9135; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 9136; AVX512DQ-NEXT: vpblendw $82, (%rsp), %ymm1, %ymm10 # 32-byte Folded Reload 9137; AVX512DQ-NEXT: # ymm10 = ymm1[0],mem[1],ymm1[2,3],mem[4],ymm1[5],mem[6],ymm1[7,8],mem[9],ymm1[10,11],mem[12],ymm1[13],mem[14],ymm1[15] 9138; AVX512DQ-NEXT: vpermq {{.*#+}} ymm11 = ymm10[2,3,0,1] 9139; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6],ymm10[7] 9140; AVX512DQ-NEXT: vpshufb %ymm0, %ymm10, %ymm0 9141; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 9142; AVX512DQ-NEXT: vpblendw $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload 9143; AVX512DQ-NEXT: # ymm10 = ymm1[0,1],mem[2],ymm1[3],mem[4],ymm1[5,6],mem[7],ymm1[8,9],mem[10],ymm1[11],mem[12],ymm1[13,14],mem[15] 9144; AVX512DQ-NEXT: vextracti128 $1, %ymm10, %xmm11 9145; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2],xmm11[3,4],xmm10[5,6,7] 9146; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[4,5,14,15,8,9,2,3,12,13,6,7,12,13,14,15] 9147; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3,4,5,6,7] 9148; AVX512DQ-NEXT: vmovdqa64 %xmm25, %xmm1 9149; AVX512DQ-NEXT: vmovdqa64 %xmm23, %xmm3 9150; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm10 = xmm3[0],xmm1[1],xmm3[2,3] 9151; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1,6,7,4,5,6,7,8,9,6,7,0,1,10,11] 9152; AVX512DQ-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 9153; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 ^ (zmm27 & (zmm10 ^ zmm0)) 9154; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm10, %zmm23 9155; AVX512DQ-NEXT: vmovdqa64 %ymm18, %ymm13 9156; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = 
ymm12[0],ymm13[1],ymm12[2],ymm13[3],ymm12[4,5],ymm13[6],ymm12[7,8],ymm13[9],ymm12[10],ymm13[11],ymm12[12,13],ymm13[14],ymm12[15] 9157; AVX512DQ-NEXT: vpermq {{.*#+}} ymm10 = ymm0[2,3,0,1] 9158; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4],ymm0[5],ymm10[6],ymm0[7] 9159; AVX512DQ-NEXT: vmovdqa64 %ymm16, %ymm14 9160; AVX512DQ-NEXT: vmovdqa64 %ymm24, %ymm15 9161; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm10 = ymm14[0],ymm15[1,2],ymm14[3],ymm15[4],ymm14[5],ymm15[6,7],ymm14[8],ymm15[9,10],ymm14[11],ymm15[12],ymm14[13],ymm15[14,15] 9162; AVX512DQ-NEXT: vextracti128 $1, %ymm10, %xmm11 9163; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2],xmm10[3] 9164; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[6,7,0,1,10,11,4,5,14,15,8,9,12,13,14,15] 9165; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [6,7,0,1,10,11,4,5,14,15,14,15,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29] 9166; AVX512DQ-NEXT: vpshufb %ymm1, %ymm0, %ymm0 9167; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3,4,5,6,7] 9168; AVX512DQ-NEXT: vmovdqa %ymm4, %ymm3 9169; AVX512DQ-NEXT: vmovdqa %ymm8, %ymm4 9170; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm10 = ymm8[0,1],ymm3[2],ymm8[3],ymm3[4],ymm8[5,6],ymm3[7],ymm8[8,9],ymm3[10],ymm8[11],ymm3[12],ymm8[13,14],ymm3[15] 9171; AVX512DQ-NEXT: vextracti128 $1, %ymm10, %xmm11 9172; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1],ymm11[2],ymm10[3,4,5,6,7] 9173; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[6,7,6,7,8,9,4,5,14,15,8,9,2,3,12,13,22,23,22,23,24,25,20,21,30,31,24,25,18,19,28,29] 9174; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm11 = xmm5[0],xmm6[1],xmm5[2,3] 9175; AVX512DQ-NEXT: vmovdqa64 %xmm30, %xmm7 9176; AVX512DQ-NEXT: vmovdqa %xmm5, %xmm6 9177; AVX512DQ-NEXT: vpshufb %xmm1, %xmm11, %xmm11 9178; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1,2],xmm10[3,4,5,6,7] 9179; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] 9180; AVX512DQ-NEXT: vmovdqa64 %xmm29, %xmm1 9181; AVX512DQ-NEXT: vmovdqa64 %xmm28, %xmm2 9182; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm8 = xmm2[0,1],xmm1[2],xmm2[3] 9183; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,6,7,8,9,8,9,2,3,12,13] 9184; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 9185; AVX512DQ-NEXT: vinserti64x4 $1, %ymm10, %zmm8, %zmm8 9186; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 ^ (mem & (zmm8 ^ zmm0)) 9187; AVX512DQ-NEXT: vmovdqa %ymm9, %ymm2 9188; AVX512DQ-NEXT: vmovdqa64 %ymm21, %ymm10 9189; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5],ymm9[6],ymm10[7,8],ymm9[9],ymm10[10,11],ymm9[12],ymm10[13],ymm9[14],ymm10[15] 9190; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm9 9191; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm9[1,2,3],xmm0[4,5],xmm9[6,7] 9192; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,10,11,4,5,14,15,8,9,2,3,12,13] 9193; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 9194; AVX512DQ-NEXT: vextracti64x4 $1, %zmm8, %ymm9 9195; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm0[1,2,3,4,5,6,7],ymm9[8],ymm0[9,10,11,12,13,14,15] 9196; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7] 9197; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm8, %zmm25 9198; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm12[0,1],ymm13[2],ymm12[3],ymm13[4],ymm12[5,6],ymm13[7],ymm12[8,9],ymm13[10],ymm12[11],ymm13[12],ymm12[13,14],ymm13[15] 9199; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm0[2,3,0,1] 9200; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4],ymm0[5,6],ymm8[7] 9201; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm8 = 
ymm15[0],ymm14[1],ymm15[2,3],ymm14[4],ymm15[5],ymm14[6],ymm15[7,8],ymm14[9],ymm15[10,11],ymm14[12],ymm15[13],ymm14[14],ymm15[15] 9202; AVX512DQ-NEXT: vextracti128 $1, %ymm8, %xmm9 9203; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm11 = xmm9[0,1,2],xmm8[3,4],xmm9[5,6,7] 9204; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm9 = [8,9,2,3,12,13,6,7,4,5,6,7,4,5,14,15,24,25,18,19,28,29,22,23,20,21,22,23,20,21,30,31] 9205; AVX512DQ-NEXT: vpshufb %ymm9, %ymm0, %ymm0 9206; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm8 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7] 9207; AVX512DQ-NEXT: vpshufb %xmm8, %xmm11, %xmm11 9208; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2],ymm0[3,4,5,6,7] 9209; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = ymm3[0],ymm4[1,2],ymm3[3],ymm4[4],ymm3[5],ymm4[6,7],ymm3[8],ymm4[9,10],ymm3[11],ymm4[12],ymm3[13],ymm4[14,15] 9210; AVX512DQ-NEXT: vextracti128 $1, %ymm4, %xmm5 9211; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3,4,5,6,7] 9212; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm6[0,1],xmm7[2],xmm6[3] 9213; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [8,9,14,15,4,5,6,7,0,1,10,11,4,5,14,15,24,25,30,31,20,21,22,23,16,17,26,27,20,21,30,31] 9214; AVX512DQ-NEXT: vpshufb %ymm3, %ymm4, %ymm4 9215; AVX512DQ-NEXT: vpshufb %xmm9, %xmm1, %xmm1 9216; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3,4,5,6,7] 9217; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7] 9218; AVX512DQ-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 16-byte Folded Reload 9219; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm4, %zmm1 9220; AVX512DQ-NEXT: movb $7, %al 9221; AVX512DQ-NEXT: kmovw %eax, %k1 9222; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm0, %zmm1 {%k1} 9223; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm4 9224; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm10[1],ymm2[2],ymm10[3],ymm2[4,5],ymm10[6],ymm2[7,8],ymm10[9],ymm2[10],ymm10[11],ymm2[12,13],ymm10[14],ymm2[15] 9225; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm5 9226; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0,1],xmm5[2,3],xmm0[4,5,6],xmm5[7] 9227; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm0 = [6,7,2,3,12,13,6,7,0,1,10,11,4,5,14,15] 9228; AVX512DQ-NEXT: vpshufb %xmm0, %xmm5, %xmm5 9229; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 9230; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm5 = ymm4[0],ymm5[1,2,3,4,5,6,7],ymm4[8],ymm5[9,10,11,12,13,14,15] 9231; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] 9232; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1 9233; AVX512DQ-NEXT: vmovdqa64 %ymm22, %ymm2 9234; AVX512DQ-NEXT: vmovdqa64 %ymm26, %ymm4 9235; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = ymm2[0],ymm4[1,2],ymm2[3],ymm4[4],ymm2[5],ymm4[6,7],ymm2[8],ymm4[9,10],ymm2[11],ymm4[12],ymm2[13],ymm4[14,15] 9236; AVX512DQ-NEXT: vextracti128 $1, %ymm4, %xmm5 9237; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2],ymm4[3,4,5,6,7] 9238; AVX512DQ-NEXT: vpshufb %ymm3, %ymm4, %ymm3 9239; AVX512DQ-NEXT: vmovdqa64 %xmm20, %xmm2 9240; AVX512DQ-NEXT: vmovdqa64 %xmm17, %xmm4 9241; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm2[2],xmm4[3] 9242; AVX512DQ-NEXT: vpshufb %xmm9, %xmm4, %xmm4 9243; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm3[3,4,5,6,7] 9244; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7] 9245; AVX512DQ-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 16-byte Folded Reload 9246; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 9247; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 9248; AVX512DQ-NEXT: vpblendw $107, (%rsp), %ymm2, 
%ymm4 # 32-byte Folded Reload 9249; AVX512DQ-NEXT: # ymm4 = mem[0,1],ymm2[2],mem[3],ymm2[4],mem[5,6],ymm2[7],mem[8,9],ymm2[10],mem[11],ymm2[12],mem[13,14],ymm2[15] 9250; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm4[2,3,0,1] 9251; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4],ymm4[5,6],ymm5[7] 9252; AVX512DQ-NEXT: vpshufb %ymm9, %ymm4, %ymm4 9253; AVX512DQ-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 9254; AVX512DQ-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload 9255; AVX512DQ-NEXT: # ymm5 = ymm2[0],mem[1],ymm2[2,3],mem[4],ymm2[5],mem[6],ymm2[7,8],mem[9],ymm2[10,11],mem[12],ymm2[13],mem[14],ymm2[15] 9256; AVX512DQ-NEXT: vextracti128 $1, %ymm5, %xmm6 9257; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3,4],xmm6[5,6,7] 9258; AVX512DQ-NEXT: vpshufb %xmm8, %xmm5, %xmm5 9259; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3,4,5,6,7] 9260; AVX512DQ-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm3 {%k1} 9261; AVX512DQ-NEXT: vmovdqa64 %ymm19, %ymm2 9262; AVX512DQ-NEXT: vmovdqa64 %ymm31, %ymm4 9263; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm2[1],ymm4[2],ymm2[3],ymm4[4,5],ymm2[6],ymm4[7,8],ymm2[9],ymm4[10],ymm2[11],ymm4[12,13],ymm2[14],ymm4[15] 9264; AVX512DQ-NEXT: vextracti128 $1, %ymm4, %xmm5 9265; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3],xmm4[4,5,6],xmm5[7] 9266; AVX512DQ-NEXT: vpshufb %xmm0, %xmm4, %xmm0 9267; AVX512DQ-NEXT: vextracti64x4 $1, %zmm3, %ymm4 9268; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 9269; AVX512DQ-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1,2,3,4,5,6,7],ymm4[8],ymm0[9,10,11,12,13,14,15] 9270; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7] 9271; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 9272; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload 9273; AVX512DQ-NEXT: vmovaps %zmm2, (%rsi) 9274; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload 9275; AVX512DQ-NEXT: vmovaps %zmm2, 64(%rsi) 9276; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload 9277; AVX512DQ-NEXT: vmovaps %zmm2, 64(%rdx) 9278; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload 9279; AVX512DQ-NEXT: vmovaps %zmm2, (%rdx) 9280; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload 9281; AVX512DQ-NEXT: vmovaps %zmm2, 64(%rcx) 9282; AVX512DQ-NEXT: vmovdqa64 %zmm23, (%rcx) 9283; AVX512DQ-NEXT: vmovdqa64 %zmm25, 64(%r8) 9284; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload 9285; AVX512DQ-NEXT: vmovaps %zmm2, (%r8) 9286; AVX512DQ-NEXT: vmovdqa64 %zmm1, 64(%r9) 9287; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%r9) 9288; AVX512DQ-NEXT: addq $552, %rsp # imm = 0x228 9289; AVX512DQ-NEXT: vzeroupper 9290; AVX512DQ-NEXT: retq 9291; 9292; AVX512DQ-FCP-LABEL: load_i16_stride5_vf64: 9293; AVX512DQ-FCP: # %bb.0: 9294; AVX512DQ-FCP-NEXT: subq $552, %rsp # imm = 0x228 9295; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [4,5,14,15,4,5,6,7,8,9,10,11,12,13,14,15] 9296; AVX512DQ-FCP-NEXT: vmovdqa 496(%rdi), %xmm1 9297; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9298; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm1 9299; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,10,11,8,9,10,11,8,9,10,11,12,13,14,15] 9300; AVX512DQ-FCP-NEXT: vmovdqa 480(%rdi), %xmm2 9301; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 9302; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2 9303; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = 
xmm2[0],xmm1[0],xmm2[1],xmm1[1] 9304; AVX512DQ-FCP-NEXT: vmovdqa 512(%rdi), %ymm10 9305; AVX512DQ-FCP-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9306; AVX512DQ-FCP-NEXT: vmovdqa 544(%rdi), %ymm11 9307; AVX512DQ-FCP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9308; AVX512DQ-FCP-NEXT: vmovdqa 576(%rdi), %ymm7 9309; AVX512DQ-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9310; AVX512DQ-FCP-NEXT: vmovdqa 608(%rdi), %ymm8 9311; AVX512DQ-FCP-NEXT: vmovdqu %ymm8, (%rsp) # 32-byte Spill 9312; AVX512DQ-FCP-NEXT: vmovdqa 352(%rdi), %ymm4 9313; AVX512DQ-FCP-NEXT: vmovdqa 320(%rdi), %ymm5 9314; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13],ymm4[14],ymm5[15] 9315; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm5, %ymm28 9316; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm4, %ymm30 9317; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm4 9318; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0],xmm4[1,2,3],xmm1[4,5],xmm4[6,7] 9319; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u] 9320; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm4, %ymm4 9321; AVX512DQ-FCP-NEXT: vmovdqa 384(%rdi), %ymm6 9322; AVX512DQ-FCP-NEXT: vmovdqa 416(%rdi), %ymm9 9323; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm9[1,2],ymm6[3],ymm9[4],ymm6[5],ymm9[6,7],ymm6[8],ymm9[9,10],ymm6[11],ymm9[12],ymm6[13],ymm9[14,15] 9324; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm9, %ymm26 9325; AVX512DQ-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9326; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm6, %ymm27 9327; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [1,3,0,2,4,6,1,3] 9328; AVX512DQ-FCP-NEXT: vpermd %ymm5, %ymm6, %ymm5 9329; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm13 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,2,3,16,17,22,23,24,25,30,31,20,21,128,128,128,128,128,128] 9330; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm5, %ymm5 9331; AVX512DQ-FCP-NEXT: vpor %ymm5, %ymm4, %ymm4 9332; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm8[0,1],ymm7[2],ymm8[3],ymm7[4],ymm8[5,6],ymm7[7],ymm8[8,9],ymm7[10],ymm8[11],ymm7[12],ymm8[13,14],ymm7[15] 9333; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm5, %xmm7 9334; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm7[3,4],xmm5[5,6,7] 9335; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm7 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10,11],ymm10[12],ymm11[13],ymm10[14],ymm11[15] 9336; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [2,4,7,1,4,6,0,0] 9337; AVX512DQ-FCP-NEXT: vpermd %ymm7, %ymm10, %ymm7 9338; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,u,u,u,u,u,8,9,14,15,0,1,6,7,16,17,22,23,20,21,22,23,24,25,30,31,16,17,22,23] 9339; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm7, %ymm7 9340; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm17 = [8,9,3,2,4,5,7,6] 9341; AVX512DQ-FCP-NEXT: vpermt2d %ymm2, %ymm17, %ymm7 9342; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [12,13,14,15,4,5,14,15,8,9,2,3,12,13,6,7] 9343; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm5, %xmm5 9344; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 9345; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3,4],ymm5[5,6,7] 9346; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm18 = [0,3,1,3,0,3,5,7] 9347; AVX512DQ-FCP-NEXT: vmovdqa 448(%rdi), %ymm7 9348; AVX512DQ-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9349; AVX512DQ-FCP-NEXT: vpermd %ymm7, %ymm18, %ymm7 9350; AVX512DQ-FCP-NEXT: 
vmovdqa {{.*#+}} ymm14 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] 9351; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm7, %ymm7 9352; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] 9353; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm7 = zmm7 ^ (zmm16 & (zmm7 ^ zmm4)) 9354; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm7, %zmm4 9355; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 9356; AVX512DQ-FCP-NEXT: vmovdqa 176(%rdi), %xmm4 9357; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm4, %xmm0 9358; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm4, %xmm31 9359; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %xmm15 9360; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm15, %xmm3 9361; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] 9362; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm11 9363; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %ymm7 9364; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm11[0],ymm7[1,2],ymm11[3],ymm7[4],ymm11[5],ymm7[6,7],ymm11[8],ymm7[9,10],ymm11[11],ymm7[12],ymm11[13],ymm7[14,15] 9365; AVX512DQ-FCP-NEXT: vpermd %ymm3, %ymm6, %ymm3 9366; AVX512DQ-FCP-NEXT: vpshufb %ymm13, %ymm3, %ymm3 9367; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm12 9368; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm6 9369; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm12[0],ymm6[1],ymm12[2,3],ymm6[4],ymm12[5],ymm6[6],ymm12[7,8],ymm6[9],ymm12[10,11],ymm6[12],ymm12[13],ymm6[14],ymm12[15] 9370; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5 9371; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1,2,3],xmm4[4,5],xmm5[6,7] 9372; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm4, %ymm1 9373; AVX512DQ-FCP-NEXT: vpor %ymm3, %ymm1, %ymm1 9374; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %ymm8 9375; AVX512DQ-FCP-NEXT: vmovdqa 224(%rdi), %ymm13 9376; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm13[0],ymm8[1],ymm13[2,3],ymm8[4],ymm13[5],ymm8[6],ymm13[7,8],ymm8[9],ymm13[10,11],ymm8[12],ymm13[13],ymm8[14],ymm13[15] 9377; AVX512DQ-FCP-NEXT: vpermd %ymm3, %ymm10, %ymm3 9378; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm3, %ymm3 9379; AVX512DQ-FCP-NEXT: vmovdqa 256(%rdi), %ymm5 9380; AVX512DQ-FCP-NEXT: vmovdqa 288(%rdi), %ymm9 9381; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm9[0,1],ymm5[2],ymm9[3],ymm5[4],ymm9[5,6],ymm5[7],ymm9[8,9],ymm5[10],ymm9[11],ymm5[12],ymm9[13,14],ymm5[15] 9382; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm5, %ymm25 9383; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm4, %xmm10 9384; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm10[3,4],xmm4[5,6,7] 9385; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm4, %xmm2 9386; AVX512DQ-FCP-NEXT: vpermt2d %ymm0, %ymm17, %ymm3 9387; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 9388; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7] 9389; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %ymm23 9390; AVX512DQ-FCP-NEXT: vpermd %ymm23, %ymm18, %ymm2 9391; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm2, %ymm2 9392; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm16 & (zmm2 ^ zmm1)) 9393; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 9394; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 9395; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm6[0],ymm12[1],ymm6[2],ymm12[3],ymm6[4,5],ymm12[6],ymm6[7,8],ymm12[9],ymm6[10],ymm12[11],ymm6[12,13],ymm12[14],ymm6[15] 9396; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm12, %ymm21 9397; AVX512DQ-FCP-NEXT: vmovdqu %ymm12, 
{{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9398; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm6, %ymm29 9399; AVX512DQ-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9400; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 9401; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6],xmm1[7] 9402; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm17 = [2,0,0,0,4,7,1,6] 9403; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0],ymm11[1],ymm7[2,3],ymm11[4],ymm7[5],ymm11[6],ymm7[7,8],ymm11[9],ymm7[10,11],ymm11[12],ymm7[13],ymm11[14],ymm7[15] 9404; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm11, %ymm19 9405; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm7, %ymm22 9406; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm17, %ymm1 9407; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,18,19,20,21,26,27,16,17,30,31,128,128,128,128,128,128] 9408; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm1, %ymm1 9409; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u] 9410; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm0, %ymm0 9411; AVX512DQ-FCP-NEXT: vpor %ymm1, %ymm0, %ymm10 9412; AVX512DQ-FCP-NEXT: vpsrlq $48, %xmm31, %xmm0 9413; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm15, %xmm1 9414; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 9415; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm20 = [0,2,5,7,4,7,0,0] 9416; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm13[1],ymm8[2,3],ymm13[4],ymm8[5],ymm13[6],ymm8[7,8],ymm13[9],ymm8[10,11],ymm13[12],ymm8[13],ymm13[14],ymm8[15] 9417; AVX512DQ-FCP-NEXT: vmovdqa %ymm8, %ymm5 9418; AVX512DQ-FCP-NEXT: vpermd %ymm0, %ymm20, %ymm4 9419; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [2,3,4,5,4,5,0,1,6,7,8,9,14,15,4,5,18,19,20,21,20,21,16,17,22,23,24,25,30,31,20,21] 9420; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm4, %ymm4 9421; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3,4,5,6,7] 9422; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm4[4,5,6,7] 9423; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} ymm24 = [1,4,6,3,1,4,6,3] 9424; AVX512DQ-FCP-NEXT: # ymm24 = mem[0,1,2,3,0,1,2,3] 9425; AVX512DQ-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9426; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm25, %ymm12 9427; AVX512DQ-FCP-NEXT: vmovdqu64 %ymm25, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 9428; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm12[0],ymm9[1,2],ymm12[3],ymm9[4],ymm12[5],ymm9[6,7],ymm12[8],ymm9[9,10],ymm12[11],ymm9[12],ymm12[13],ymm9[14,15] 9429; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm24, %ymm0 9430; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,2,3,4,5,10,11,0,1,14,15,8,9,16,17,18,19,18,19,20,21,26,27,16,17,30,31,24,25] 9431; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm0, %ymm0 9432; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4],ymm0[5,6,7] 9433; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm25 = [1,3,2,3,1,3,6,7] 9434; AVX512DQ-FCP-NEXT: vpermd %ymm23, %ymm25, %ymm4 9435; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25] 9436; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm4, %ymm4 9437; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 ^ (zmm16 & (zmm4 ^ zmm10)) 9438; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm10 9439; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm4, %zmm0 9440; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 9441; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm28, %ymm18 9442; AVX512DQ-FCP-NEXT: vmovdqa64 
%ymm30, %ymm28 9443; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm0 9444; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm30, %ymm4 9445; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm0[1],ymm4[2],ymm0[3],ymm4[4,5],ymm0[6],ymm4[7,8],ymm0[9],ymm4[10],ymm0[11],ymm4[12,13],ymm0[14],ymm4[15] 9446; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm4 9447; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5,6],xmm4[7] 9448; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm27, %ymm7 9449; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm26, %ymm4 9450; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm7[1],ymm4[2,3],ymm7[4],ymm4[5],ymm7[6],ymm4[7,8],ymm7[9],ymm4[10,11],ymm7[12],ymm4[13],ymm7[14],ymm4[15] 9451; AVX512DQ-FCP-NEXT: vpermd %ymm4, %ymm17, %ymm4 9452; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm4, %ymm4 9453; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm0, %ymm0 9454; AVX512DQ-FCP-NEXT: vpor %ymm4, %ymm0, %ymm0 9455; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload 9456; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm16 # 32-byte Reload 9457; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm16, %ymm4 9458; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm11[0],ymm4[1],ymm11[2,3],ymm4[4],ymm11[5],ymm4[6],ymm11[7,8],ymm4[9],ymm11[10,11],ymm4[12],ymm11[13],ymm4[14],ymm11[15] 9459; AVX512DQ-FCP-NEXT: vpermd %ymm4, %ymm20, %ymm4 9460; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm4, %ymm3 9461; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload 9462; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm6, %xmm2 9463; AVX512DQ-FCP-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm27 # 16-byte Reload 9464; AVX512DQ-FCP-NEXT: vpsrlq $48, %xmm27, %xmm4 9465; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] 9466; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4,5,6,7] 9467; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7] 9468; AVX512DQ-FCP-NEXT: vmovdqu (%rsp), %ymm8 # 32-byte Reload 9469; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 9470; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm8[1,2],ymm4[3],ymm8[4],ymm4[5],ymm8[6,7],ymm4[8],ymm8[9,10],ymm4[11],ymm8[12],ymm4[13],ymm8[14,15] 9471; AVX512DQ-FCP-NEXT: vpermd %ymm3, %ymm24, %ymm3 9472; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm3, %ymm1 9473; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] 9474; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Reload 9475; AVX512DQ-FCP-NEXT: vpermd %ymm17, %ymm25, %ymm2 9476; AVX512DQ-FCP-NEXT: vpshufb %ymm14, %ymm2, %ymm2 9477; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm10 & (zmm2 ^ zmm0)) 9478; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm0 9479; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 9480; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm12[1],ymm9[2,3],ymm12[4],ymm9[5],ymm12[6],ymm9[7,8],ymm12[9],ymm9[10,11],ymm12[12],ymm9[13],ymm12[14],ymm9[15] 9481; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm1 9482; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7] 9483; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm15[u,u,u,u,u,u,u,u,4,5,14,15,u,u,u,u] 9484; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm15, %xmm20 9485; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm31[2],xmm0[3],xmm31[3] 9486; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm24 = [0,3,5,2,5,7,0,0] 9487; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = 
ymm13[0],ymm5[1],ymm13[2],ymm5[3],ymm13[4,5],ymm5[6],ymm13[7,8],ymm5[9],ymm13[10],ymm5[11],ymm13[12,13],ymm5[14],ymm13[15] 9488; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm5, %ymm30 9489; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm13, %ymm26 9490; AVX512DQ-FCP-NEXT: vpermd %ymm2, %ymm24, %ymm3 9491; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm12 = [0,1,6,7,2,3,2,3,4,5,10,11,0,1,14,15,16,17,22,23,18,19,18,19,20,21,26,27,16,17,30,31] 9492; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm3, %ymm3 9493; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[3,4,5,6,7] 9494; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1,2,3],ymm3[4,5,6,7] 9495; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,8,9,2,3,12,13,6,7,0,1,10,11] 9496; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm1, %xmm1 9497; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 9498; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1,2,3,4],ymm1[5,6,7] 9499; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm21, %ymm0 9500; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm29, %ymm1 9501; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15] 9502; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm3 9503; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm5 = xmm1[0,1,2],xmm3[3,4],xmm1[5,6,7] 9504; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm25 = [0,2,0,0,5,7,2,4] 9505; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm0 9506; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm22, %ymm1 9507; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] 9508; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm22, %ymm29 9509; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm21 9510; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm25, %ymm3 9511; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,6,7,8,9,14,15,4,5,6,7,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23] 9512; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm3, %ymm0 9513; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,14,15,8,9,2,3,12,13,6,7,12,13,14,15] 9514; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm5, %xmm5 9515; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2],ymm0[3,4,5,6,7] 9516; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,4,6,0,1,4,6,0] 9517; AVX512DQ-FCP-NEXT: # ymm5 = mem[0,1,0,1] 9518; AVX512DQ-FCP-NEXT: vpermd %ymm23, %ymm5, %ymm14 9519; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27] 9520; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm14, %ymm14 9521; AVX512DQ-FCP-NEXT: vmovdqa64 {{.*#+}} zmm15 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535] 9522; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 ^ (zmm15 & (zmm14 ^ zmm0)) 9523; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm14, %zmm0 9524; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 9525; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm4[1],ymm8[2,3],ymm4[4],ymm8[5],ymm4[6],ymm8[7,8],ymm4[9],ymm8[10,11],ymm4[12],ymm8[13],ymm4[14],ymm8[15] 9526; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm0, %xmm10 9527; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3,4],xmm10[5,6,7] 9528; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm0 9529; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm16, %ymm8 9530; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = 
ymm8[0],ymm11[1],ymm8[2],ymm11[3],ymm8[4,5],ymm11[6],ymm8[7,8],ymm11[9],ymm8[10],ymm11[11],ymm8[12,13],ymm11[14],ymm8[15] 9531; AVX512DQ-FCP-NEXT: vmovdqa %ymm11, %ymm13 9532; AVX512DQ-FCP-NEXT: vpermd %ymm2, %ymm24, %ymm2 9533; AVX512DQ-FCP-NEXT: vpshufb %ymm12, %ymm2, %ymm2 9534; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm6[u,u,u,u,u,u,u,u,4,5,14,15,u,u,u,u] 9535; AVX512DQ-FCP-NEXT: vmovdqa %xmm6, %xmm14 9536; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm4[2],xmm27[2],xmm4[3],xmm27[3] 9537; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm27, %xmm6 9538; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2],xmm2[3,4,5,6,7] 9539; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] 9540; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 9541; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7] 9542; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm11 9543; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm28, %ymm12 9544; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm12[0,1],ymm11[2],ymm12[3],ymm11[4],ymm12[5,6],ymm11[7],ymm12[8,9],ymm11[10],ymm12[11],ymm11[12],ymm12[13,14],ymm11[15] 9545; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm2, %xmm4 9546; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[3,4],xmm2[5,6,7] 9547; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2 9548; AVX512DQ-FCP-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm3 # 32-byte Folded Reload 9549; AVX512DQ-FCP-NEXT: # ymm3 = ymm7[0],mem[1],ymm7[2,3],mem[4],ymm7[5],mem[6],ymm7[7,8],mem[9],ymm7[10,11],mem[12],ymm7[13],mem[14],ymm7[15] 9550; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm7, %ymm16 9551; AVX512DQ-FCP-NEXT: vpermd %ymm3, %ymm25, %ymm3 9552; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm3, %ymm1 9553; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7] 9554; AVX512DQ-FCP-NEXT: vpermd %ymm17, %ymm5, %ymm2 9555; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm2 9556; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 ^ (zmm15 & (zmm2 ^ zmm1)) 9557; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm28 9558; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm31, %xmm15 9559; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm20, %xmm7 9560; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm15[0],xmm7[1],xmm15[2,3] 9561; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm0 = [6,7,0,1,10,11,0,0,0,0,0,0,0,0,0,0] 9562; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm1, %xmm2 9563; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm18 = [1,3,6,0,5,0,0,0] 9564; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm30, %ymm9 9565; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm26, %ymm10 9566; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0,1],ymm9[2],ymm10[3],ymm9[4],ymm10[5,6],ymm9[7],ymm10[8,9],ymm9[10],ymm10[11],ymm9[12],ymm10[13,14],ymm9[15] 9567; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm18, %ymm4 9568; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [2,3,2,3,4,5,0,1,6,7,8,9,14,15,4,5,18,19,18,19,20,21,16,17,22,23,24,25,30,31,20,21] 9569; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm4, %ymm4 9570; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[3,4,5,6,7] 9571; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] 9572; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [2,4,7,0,2,4,7,0] 9573; AVX512DQ-FCP-NEXT: # ymm4 = mem[0,1,0,1] 9574; AVX512DQ-FCP-NEXT: vpermd %ymm23, %ymm4, %ymm5 9575; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25] 9576; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm5, %ymm5 9577; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm5, %zmm25 9578; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = 
xmm6[0],xmm14[1],xmm6[2,3] 9579; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm27, %xmm22 9580; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm14, %xmm31 9581; AVX512DQ-FCP-NEXT: vpshufb %xmm0, %xmm2, %xmm0 9582; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0,1],ymm13[2],ymm8[3],ymm13[4],ymm8[5,6],ymm13[7],ymm8[8,9],ymm13[10],ymm8[11],ymm13[12],ymm8[13,14],ymm13[15] 9583; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm13, %ymm20 9584; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm8, %ymm30 9585; AVX512DQ-FCP-NEXT: vpermd %ymm2, %ymm18, %ymm2 9586; AVX512DQ-FCP-NEXT: vpshufb %ymm1, %ymm2, %ymm1 9587; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3,4,5,6,7] 9588; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] 9589; AVX512DQ-FCP-NEXT: vpermd %ymm17, %ymm4, %ymm1 9590; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm1, %ymm1 9591; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm24 9592; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm15[0,1],xmm7[2],xmm15[3] 9593; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm1 = ymm9[0],ymm10[1,2],ymm9[3],ymm10[4],ymm9[5],ymm10[6,7],ymm9[8],ymm10[9,10],ymm9[11],ymm10[12],ymm9[13],ymm10[14,15] 9594; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm2 = [8,9,2,3,12,13,0,0,0,0,0,0,0,0,0,0] 9595; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm0, %xmm3 9596; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm27 = [1,4,6,3,6,0,0,0] 9597; AVX512DQ-FCP-NEXT: vpermd %ymm1, %ymm27, %ymm1 9598; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,2,3,4,5,2,3,4,5,10,11,0,1,14,15,16,17,18,19,20,21,18,19,20,21,26,27,16,17,30,31] 9599; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm1, %ymm1 9600; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2],xmm1[3,4,5,6,7] 9601; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7] 9602; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm26 = [0,2,1,3,0,2,5,7] 9603; AVX512DQ-FCP-NEXT: vpermd %ymm23, %ymm26, %ymm3 9604; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,6,7,4,5,6,7,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] 9605; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3 9606; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm4, %ymm19 9607; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 9608; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 9609; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload 9610; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm14[1,2],ymm4[3],ymm14[4],ymm4[5],ymm14[6,7],ymm4[8],ymm14[9,10],ymm4[11],ymm14[12],ymm4[13],ymm14[14,15] 9611; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm4, %ymm18 9612; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4 9613; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3] 9614; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,3,0,0,5,0,2,7] 9615; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm21, %ymm15 9616; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm29, %ymm10 9617; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm5 = ymm10[0],ymm15[1],ymm10[2],ymm15[3],ymm10[4,5],ymm15[6],ymm10[7,8],ymm15[9],ymm10[10],ymm15[11],ymm10[12,13],ymm15[14],ymm10[15] 9618; AVX512DQ-FCP-NEXT: vpermd %ymm5, %ymm4, %ymm5 9619; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [2,3,4,5,10,11,0,1,14,15,14,15,2,3,4,5,18,19,20,21,26,27,16,17,30,31,30,31,18,19,20,21] 9620; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm5 9621; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [6,7,0,1,10,11,4,5,14,15,8,9,12,13,14,15] 9622; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm3, %xmm3 9623; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm5[3,4,5,6,7] 9624; AVX512DQ-FCP-NEXT: vpmovsxdq {{.*#+}} zmm5 = 
[18446744073709551615,18446744073709551615,18446744073709551615,65535,0,0,18446744073709486080,18446744073709551615] 9625; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm25 = zmm25 ^ (zmm5 & (zmm25 ^ zmm3)) 9626; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 9627; AVX512DQ-FCP-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload 9628; AVX512DQ-FCP-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3],mem[4],ymm3[5],mem[6],ymm3[7,8],mem[9],ymm3[10,11],mem[12],ymm3[13],mem[14],ymm3[15] 9629; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm8 9630; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm8 = xmm3[0],xmm8[1,2,3],xmm3[4,5],xmm8[6,7] 9631; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,0,1,10,11,4,5,14,15,8,9,2,3,12,13] 9632; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm8, %xmm8 9633; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 9634; AVX512DQ-FCP-NEXT: vextracti64x4 $1, %zmm25, %ymm9 9635; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1,2,3,4,5,6,7],ymm9[8],ymm8[9,10,11,12,13,14,15] 9636; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] 9637; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm25, %zmm21 9638; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm11[0],ymm12[1,2],ymm11[3],ymm12[4],ymm11[5],ymm12[6,7],ymm11[8],ymm12[9,10],ymm11[11],ymm12[12],ymm11[13],ymm12[14,15] 9639; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm12, %ymm23 9640; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm11, %ymm25 9641; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm8, %xmm9 9642; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0],xmm8[1],xmm9[2],xmm8[3] 9643; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm8, %xmm7 9644; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload 9645; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm16, %ymm12 9646; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm8 = ymm13[0],ymm12[1],ymm13[2],ymm12[3],ymm13[4,5],ymm12[6],ymm13[7,8],ymm12[9],ymm13[10],ymm12[11],ymm13[12,13],ymm12[14],ymm13[15] 9647; AVX512DQ-FCP-NEXT: vpermd %ymm8, %ymm4, %ymm4 9648; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm4, %ymm4 9649; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3,4,5,6,7] 9650; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm24 = zmm24 ^ (zmm5 & (zmm24 ^ zmm4)) 9651; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm22, %xmm4 9652; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm31, %xmm5 9653; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3] 9654; AVX512DQ-FCP-NEXT: vpshufb %xmm2, %xmm4, %xmm2 9655; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm20, %ymm4 9656; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm30, %ymm5 9657; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0],ymm5[1,2],ymm4[3],ymm5[4],ymm4[5],ymm5[6,7],ymm4[8],ymm5[9,10],ymm4[11],ymm5[12],ymm4[13],ymm5[14,15] 9658; AVX512DQ-FCP-NEXT: vpermd %ymm4, %ymm27, %ymm4 9659; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm4, %ymm0 9660; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm0[3,4,5,6,7] 9661; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] 9662; AVX512DQ-FCP-NEXT: vpermd %ymm17, %ymm26, %ymm2 9663; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm19, %ymm4 9664; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm2 9665; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 9666; AVX512DQ-FCP-NEXT: vmovdqu (%rsp), %ymm9 # 32-byte Reload 9667; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload 9668; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm11[0],ymm9[1],ymm11[2,3],ymm9[4],ymm11[5],ymm9[6],ymm11[7,8],ymm9[9],ymm11[10,11],ymm9[12],ymm11[13],ymm9[14],ymm11[15] 9669; AVX512DQ-FCP-NEXT: vextracti128 
$1, %ymm2, %xmm4 9670; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4,5],xmm4[6,7] 9671; AVX512DQ-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2 9672; AVX512DQ-FCP-NEXT: vextracti64x4 $1, %zmm24, %ymm3 9673; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 9674; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1,2,3,4,5,6,7],ymm3[8],ymm2[9,10,11,12,13,14,15] 9675; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] 9676; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm24, %zmm2 9677; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm10[0,1],ymm15[2],ymm10[3],ymm15[4],ymm10[5,6],ymm15[7],ymm10[8,9],ymm15[10],ymm10[11],ymm15[12],ymm10[13,14],ymm15[15] 9678; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,3,0,0,6,0,3,5] 9679; AVX512DQ-FCP-NEXT: vpermd %ymm3, %ymm4, %ymm3 9680; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,6,7,8,9,14,15,0,1,6,7,0,1,6,7,16,17,22,23,24,25,30,31,16,17,22,23,16,17,22,23] 9681; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm3, %ymm3 9682; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm6 9683; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm14[0],ymm6[1],ymm14[2,3],ymm6[4],ymm14[5],ymm6[6],ymm14[7,8],ymm6[9],ymm14[10,11],ymm6[12],ymm14[13],ymm6[14],ymm14[15] 9684; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm6, %xmm7 9685; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3,4],xmm7[5,6,7] 9686; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7] 9687; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm6, %xmm6 9688; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2],ymm3[3,4,5,6,7] 9689; AVX512DQ-FCP-NEXT: movb $7, %al 9690; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 9691; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm1 {%k1} 9692; AVX512DQ-FCP-NEXT: vextracti64x4 $1, %zmm1, %ymm3 9693; AVX512DQ-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 9694; AVX512DQ-FCP-NEXT: vpblendw $181, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload 9695; AVX512DQ-FCP-NEXT: # ymm6 = mem[0],ymm6[1],mem[2],ymm6[3],mem[4,5],ymm6[6],mem[7,8],ymm6[9],mem[10],ymm6[11],mem[12,13],ymm6[14],mem[15] 9696; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm6, %xmm8 9697; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm8[2,3],xmm6[4,5,6],xmm8[7] 9698; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} xmm8 = [6,7,2,3,12,13,6,7,0,1,10,11,4,5,14,15] 9699; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm6, %xmm6 9700; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 9701; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm3[0],ymm6[1,2,3,4,5,6,7],ymm3[8],ymm6[9,10,11,12,13,14,15] 9702; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm6[4,5,6,7] 9703; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 9704; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm23, %ymm3 9705; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm25, %ymm6 9706; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm6[1],ymm3[2,3],ymm6[4],ymm3[5],ymm6[6],ymm3[7,8],ymm6[9],ymm3[10,11],ymm6[12],ymm3[13],ymm6[14],ymm3[15] 9707; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm6 9708; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1,2],xmm3[3,4],xmm6[5,6,7] 9709; AVX512DQ-FCP-NEXT: vpshufb %xmm7, %xmm3, %xmm3 9710; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm6 = ymm13[0,1],ymm12[2],ymm13[3],ymm12[4],ymm13[5,6],ymm12[7],ymm13[8,9],ymm12[10],ymm13[11],ymm12[12],ymm13[13,14],ymm12[15] 9711; AVX512DQ-FCP-NEXT: vpermd %ymm6, %ymm4, %ymm4 9712; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm4, %ymm4 9713; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2],ymm4[3,4,5,6,7] 9714; 
AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm0 {%k1}
; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm9[0],ymm11[1],ymm9[2],ymm11[3],ymm9[4,5],ymm11[6],ymm9[7,8],ymm11[9],ymm9[10],ymm11[11],ymm9[12,13],ymm11[14],ymm9[15]
; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4
; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5,6],xmm4[7]
; AVX512DQ-FCP-NEXT: vpshufb %xmm8, %xmm3, %xmm3
; AVX512DQ-FCP-NEXT: vextracti64x4 $1, %zmm0, %ymm4
; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
; AVX512DQ-FCP-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1,2,3,4,5,6,7],ymm4[8],ymm3[9,10,11,12,13,14,15]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovaps %zmm3, (%rsi)
; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovaps %zmm3, 64(%rsi)
; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovaps %zmm3, 64(%rdx)
; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovaps %zmm3, (%rdx)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, 64(%rcx)
; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovaps %zmm3, (%rcx)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 64(%r8)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, (%r8)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 64(%r9)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, (%r9)
; AVX512DQ-FCP-NEXT: addq $552, %rsp # imm = 0x228
; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
;
; AVX512BW-LABEL: load_i16_stride5_vf64:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm2
; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm4
; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm3
; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm5
; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm0
; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm7
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm9
; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm11
; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm1
; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm10
; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [16,21,26,31,36,41,46,51,56,61,0,0,0,1,6,11,16,21,26,31,36,41,46,51,56,61,0,0,0,1,6,11]
; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm13
; AVX512BW-NEXT: vpermt2w %zmm10, %zmm12, %zmm13
; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,5,10,15,20,25,30,35,40,45,50,55,60,0,0,0]
; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm8
; AVX512BW-NEXT: vpermt2w %zmm11, %zmm6, %zmm8
; AVX512BW-NEXT: movl $67100672, %eax # imm = 0x3FFE000
; AVX512BW-NEXT: kmovd %eax, %k1
; AVX512BW-NEXT: vmovdqu16 %zmm13, %zmm8 {%k1}
; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm13 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,34,39,44,49,54,59]
; AVX512BW-NEXT: vpermt2w %zmm7, %zmm13, %zmm8
; AVX512BW-NEXT: vpermi2w %zmm5, %zmm0, %zmm12
; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm6
; AVX512BW-NEXT: vmovdqu16 %zmm12, %zmm6 {%k1}
; AVX512BW-NEXT: vpermt2w %zmm2, %zmm13, %zmm6
; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [49,54,59,0,5,10,15,20,25,30,0,0,0,34,39,44,49,54,59,0,5,10,15,20,25,30,0,0,0,34,39,44]
; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm15
; AVX512BW-NEXT: vpermt2w %zmm1, %zmm14, %zmm15
; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm12 = [1,6,11,16,21,26,31,36,41,46,51,56,61,0,0,0]
; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm13
; AVX512BW-NEXT: vpermt2w %zmm11, %zmm12, %zmm13
; AVX512BW-NEXT: vmovdqu16 %zmm15, %zmm13 {%k1}
; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm15 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,35,40,45,50,55,60]
; AVX512BW-NEXT: vpermt2w %zmm7, %zmm15, %zmm13
; AVX512BW-NEXT: vpermi2w %zmm0, %zmm5, %zmm14
; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm12
; AVX512BW-NEXT: vmovdqu16 %zmm14, %zmm12 {%k1}
; AVX512BW-NEXT: vpermt2w %zmm2, %zmm15, %zmm12
; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [50,55,60,1,6,11,16,21,26,31,0,0,0,35,40,45,50,55,60,1,6,11,16,21,26,31,0,0,0,35,40,45]
; AVX512BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm17
; AVX512BW-NEXT: vpermt2w %zmm1, %zmm16, %zmm17
; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm14 = [34,39,44,49,54,59,0,5,10,15,20,25,30,0,0,0]
; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm15
; AVX512BW-NEXT: vpermt2w %zmm9, %zmm14, %zmm15
; AVX512BW-NEXT: vmovdqu16 %zmm17, %zmm15 {%k1}
; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm17 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,36,41,46,51,56,61]
; AVX512BW-NEXT: vpermt2w %zmm7, %zmm17, %zmm15
; AVX512BW-NEXT: vpermi2w %zmm0, %zmm5, %zmm16
; AVX512BW-NEXT: vpermi2w %zmm3, %zmm4, %zmm14
; AVX512BW-NEXT: vmovdqu16 %zmm16, %zmm14 {%k1}
; AVX512BW-NEXT: vpermt2w %zmm2, %zmm17, %zmm14
; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [19,24,29,34,39,44,49,54,59,0,0,0,0,4,9,14,19,24,29,34,39,44,49,54,59,0,0,0,0,4,9,14]
; AVX512BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm17
; AVX512BW-NEXT: vpermt2w %zmm10, %zmm16, %zmm17
; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm18 = [35,40,45,50,55,60,1,6,11,16,21,26,31,0,0,0]
; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm19
; AVX512BW-NEXT: vpermt2w %zmm9, %zmm18, %zmm19
; AVX512BW-NEXT: movl $33546240, %eax # imm = 0x1FFE000
; AVX512BW-NEXT: kmovd %eax, %k1
; AVX512BW-NEXT: vmovdqu16 %zmm17, %zmm19 {%k1}
; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm17 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,32,37,42,47,52,57,62]
; AVX512BW-NEXT: vpermt2w %zmm7, %zmm17, %zmm19
; AVX512BW-NEXT: vpermi2w %zmm5, %zmm0, %zmm16
; AVX512BW-NEXT: vpermi2w %zmm3, %zmm4, %zmm18
; AVX512BW-NEXT: vmovdqu16 %zmm16, %zmm18 {%k1}
; AVX512BW-NEXT: vpermt2w %zmm2, %zmm17, %zmm18
; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm16 = [4,9,14,19,24,29,34,39,44,49,54,59,0,0,0,0]
; AVX512BW-NEXT: vpermt2w %zmm11, %zmm16, %zmm9
; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [20,25,30,35,40,45,50,55,60,0,0,0,0,5,10,15,20,25,30,35,40,45,50,55,60,0,0,0,0,5,10,15]
; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpermt2w %zmm10, %zmm11, %zmm1
; AVX512BW-NEXT: movb $7, %al
; AVX512BW-NEXT: kmovd %eax, %k1
; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1}
; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,33,38,43,48,53,58,63]
; AVX512BW-NEXT: vpermt2w %zmm7, %zmm9, %zmm1
; AVX512BW-NEXT: vpermt2w %zmm5, %zmm11, %zmm0
; AVX512BW-NEXT: vpermt2w %zmm4, %zmm16, %zmm3
; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1}
; AVX512BW-NEXT: vpermt2w %zmm2, %zmm9, %zmm0
; AVX512BW-NEXT: vmovdqa64 %zmm6, 64(%rsi)
; AVX512BW-NEXT: vmovdqa64 %zmm8, (%rsi)
; AVX512BW-NEXT: vmovdqa64 %zmm12, 64(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm13, (%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm14, 64(%rcx)
; AVX512BW-NEXT: vmovdqa64 %zmm15, (%rcx)
; AVX512BW-NEXT: vmovdqa64 %zmm18, 64(%r8)
; AVX512BW-NEXT: vmovdqa64 %zmm19, (%r8)
; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%r9)
; AVX512BW-NEXT: vmovdqa64 %zmm1, (%r9)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BW-FCP-LABEL: load_i16_stride5_vf64:
; AVX512BW-FCP: # %bb.0:
; AVX512BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm2
; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm4
; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm3
; AVX512BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm5
; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm0
; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm7
; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm9
; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm11
; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1
; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm10
; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [16,21,26,31,36,41,46,51,56,61,0,0,0,1,6,11,16,21,26,31,36,41,46,51,56,61,0,0,0,1,6,11]
; AVX512BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm13
; AVX512BW-FCP-NEXT: vpermt2w %zmm10, %zmm12, %zmm13
; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,5,10,15,20,25,30,35,40,45,50,55,60,0,0,0]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm8
; AVX512BW-FCP-NEXT: vpermt2w %zmm11, %zmm6, %zmm8
; AVX512BW-FCP-NEXT: movl $67100672, %eax # imm = 0x3FFE000
; AVX512BW-FCP-NEXT: kmovd %eax, %k1
; AVX512BW-FCP-NEXT: vmovdqu16 %zmm13, %zmm8 {%k1}
; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm13 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,34,39,44,49,54,59]
; AVX512BW-FCP-NEXT: vpermt2w %zmm7, %zmm13, %zmm8
; AVX512BW-FCP-NEXT: vpermi2w %zmm5, %zmm0, %zmm12
; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm3, %zmm6
; AVX512BW-FCP-NEXT: vmovdqu16 %zmm12, %zmm6 {%k1}
; AVX512BW-FCP-NEXT: vpermt2w %zmm2, %zmm13, %zmm6
; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [49,54,59,0,5,10,15,20,25,30,0,0,0,34,39,44,49,54,59,0,5,10,15,20,25,30,0,0,0,34,39,44]
; AVX512BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm15
; AVX512BW-FCP-NEXT: vpermt2w %zmm1, %zmm14, %zmm15
; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm12 = [1,6,11,16,21,26,31,36,41,46,51,56,61,0,0,0]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm13
; AVX512BW-FCP-NEXT: vpermt2w %zmm11, %zmm12, %zmm13
; AVX512BW-FCP-NEXT: vmovdqu16 %zmm15, %zmm13 {%k1}
; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm15 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,35,40,45,50,55,60]
; AVX512BW-FCP-NEXT: vpermt2w %zmm7, %zmm15, %zmm13
; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm5, %zmm14
; AVX512BW-FCP-NEXT: vpermi2w %zmm4, %zmm3, %zmm12
; AVX512BW-FCP-NEXT: vmovdqu16 %zmm14, %zmm12 {%k1}
; AVX512BW-FCP-NEXT: vpermt2w %zmm2, %zmm15, %zmm12
; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [50,55,60,1,6,11,16,21,26,31,0,0,0,35,40,45,50,55,60,1,6,11,16,21,26,31,0,0,0,35,40,45]
; AVX512BW-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm17
; AVX512BW-FCP-NEXT: vpermt2w %zmm1, %zmm16, %zmm17
; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [34,39,44,49,54,59,0,5,10,15,20,25,30,0,0,0]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm15
; AVX512BW-FCP-NEXT: vpermt2w %zmm9, %zmm14, %zmm15
; AVX512BW-FCP-NEXT: vmovdqu16 %zmm17, %zmm15 {%k1}
; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm17 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,36,41,46,51,56,61]
; AVX512BW-FCP-NEXT: vpermt2w %zmm7, %zmm17, %zmm15
; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm5, %zmm16
; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm4, %zmm14
; AVX512BW-FCP-NEXT: vmovdqu16 %zmm16, %zmm14 {%k1}
; AVX512BW-FCP-NEXT: vpermt2w %zmm2, %zmm17, %zmm14
; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [19,24,29,34,39,44,49,54,59,0,0,0,0,4,9,14,19,24,29,34,39,44,49,54,59,0,0,0,0,4,9,14]
; AVX512BW-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm17
; AVX512BW-FCP-NEXT: vpermt2w %zmm10, %zmm16, %zmm17
; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm18 = [35,40,45,50,55,60,1,6,11,16,21,26,31,0,0,0]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm19
; AVX512BW-FCP-NEXT: vpermt2w %zmm9, %zmm18, %zmm19
; AVX512BW-FCP-NEXT: movl $33546240, %eax # imm = 0x1FFE000
; AVX512BW-FCP-NEXT: kmovd %eax, %k1
; AVX512BW-FCP-NEXT: vmovdqu16 %zmm17, %zmm19 {%k1}
; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm17 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,32,37,42,47,52,57,62]
; AVX512BW-FCP-NEXT: vpermt2w %zmm7, %zmm17, %zmm19
; AVX512BW-FCP-NEXT: vpermi2w %zmm5, %zmm0, %zmm16
; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm4, %zmm18
; AVX512BW-FCP-NEXT: vmovdqu16 %zmm16, %zmm18 {%k1}
; AVX512BW-FCP-NEXT: vpermt2w %zmm2, %zmm17, %zmm18
; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm16 = [4,9,14,19,24,29,34,39,44,49,54,59,0,0,0,0]
; AVX512BW-FCP-NEXT: vpermt2w %zmm11, %zmm16, %zmm9
; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [20,25,30,35,40,45,50,55,60,0,0,0,0,5,10,15,20,25,30,35,40,45,50,55,60,0,0,0,0,5,10,15]
; AVX512BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vpermt2w %zmm10, %zmm11, %zmm1
; AVX512BW-FCP-NEXT: movb $7, %al
; AVX512BW-FCP-NEXT: kmovd %eax, %k1
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1}
; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,33,38,43,48,53,58,63]
; AVX512BW-FCP-NEXT: vpermt2w %zmm7, %zmm9, %zmm1
; AVX512BW-FCP-NEXT: vpermt2w %zmm5, %zmm11, %zmm0
; AVX512BW-FCP-NEXT: vpermt2w %zmm4, %zmm16, %zmm3
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1}
; AVX512BW-FCP-NEXT: vpermt2w %zmm2, %zmm9, %zmm0
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, 64(%rsi)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, (%rsi)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, 64(%rdx)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, (%rdx)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, 64(%rcx)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, (%rcx)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, 64(%r8)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, (%r8)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 64(%r9)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, (%r9)
; AVX512BW-FCP-NEXT: vzeroupper
; AVX512BW-FCP-NEXT: retq
;
; AVX512DQ-BW-LABEL: load_i16_stride5_vf64:
; AVX512DQ-BW: # %bb.0:
; AVX512DQ-BW-NEXT: vmovdqa64 576(%rdi), %zmm2
; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm4
; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm3
; AVX512DQ-BW-NEXT: vmovdqa64 512(%rdi), %zmm5
; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm0
; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm7
; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm9
; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm11
; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm1
; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm10
; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [16,21,26,31,36,41,46,51,56,61,0,0,0,1,6,11,16,21,26,31,36,41,46,51,56,61,0,0,0,1,6,11]
; AVX512DQ-BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm13
; AVX512DQ-BW-NEXT: vpermt2w %zmm10, %zmm12, %zmm13
; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,5,10,15,20,25,30,35,40,45,50,55,60,0,0,0]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm8
; AVX512DQ-BW-NEXT: vpermt2w %zmm11, %zmm6, %zmm8
; AVX512DQ-BW-NEXT: movl $67100672, %eax # imm = 0x3FFE000
; AVX512DQ-BW-NEXT: kmovd %eax, %k1
; AVX512DQ-BW-NEXT: vmovdqu16 %zmm13, %zmm8 {%k1}
; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm13 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,34,39,44,49,54,59]
; AVX512DQ-BW-NEXT: vpermt2w %zmm7, %zmm13, %zmm8
; AVX512DQ-BW-NEXT: vpermi2w %zmm5, %zmm0, %zmm12
; AVX512DQ-BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm6
; AVX512DQ-BW-NEXT: vmovdqu16 %zmm12, %zmm6 {%k1}
; AVX512DQ-BW-NEXT: vpermt2w %zmm2, %zmm13, %zmm6
; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [49,54,59,0,5,10,15,20,25,30,0,0,0,34,39,44,49,54,59,0,5,10,15,20,25,30,0,0,0,34,39,44]
; AVX512DQ-BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm15
; AVX512DQ-BW-NEXT: vpermt2w %zmm1, %zmm14, %zmm15
; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm12 = [1,6,11,16,21,26,31,36,41,46,51,56,61,0,0,0]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm13
; AVX512DQ-BW-NEXT: vpermt2w %zmm11, %zmm12, %zmm13
; AVX512DQ-BW-NEXT: vmovdqu16 %zmm15, %zmm13 {%k1}
; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm15 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,35,40,45,50,55,60]
; AVX512DQ-BW-NEXT: vpermt2w %zmm7, %zmm15, %zmm13
; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm5, %zmm14
; AVX512DQ-BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm12
; AVX512DQ-BW-NEXT: vmovdqu16 %zmm14, %zmm12 {%k1}
; AVX512DQ-BW-NEXT: vpermt2w %zmm2, %zmm15, %zmm12
; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [50,55,60,1,6,11,16,21,26,31,0,0,0,35,40,45,50,55,60,1,6,11,16,21,26,31,0,0,0,35,40,45]
; AVX512DQ-BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm17
; AVX512DQ-BW-NEXT: vpermt2w %zmm1, %zmm16, %zmm17
; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm14 = [34,39,44,49,54,59,0,5,10,15,20,25,30,0,0,0]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm15
; AVX512DQ-BW-NEXT: vpermt2w %zmm9, %zmm14, %zmm15
; AVX512DQ-BW-NEXT: vmovdqu16 %zmm17, %zmm15 {%k1}
; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm17 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,36,41,46,51,56,61]
; AVX512DQ-BW-NEXT: vpermt2w %zmm7, %zmm17, %zmm15
; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm5, %zmm16
; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm4, %zmm14
; AVX512DQ-BW-NEXT: vmovdqu16 %zmm16, %zmm14 {%k1}
; AVX512DQ-BW-NEXT: vpermt2w %zmm2, %zmm17, %zmm14
; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [19,24,29,34,39,44,49,54,59,0,0,0,0,4,9,14,19,24,29,34,39,44,49,54,59,0,0,0,0,4,9,14]
; AVX512DQ-BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm17
; AVX512DQ-BW-NEXT: vpermt2w %zmm10, %zmm16, %zmm17
; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm18 = [35,40,45,50,55,60,1,6,11,16,21,26,31,0,0,0]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm19
; AVX512DQ-BW-NEXT: vpermt2w %zmm9, %zmm18, %zmm19
; AVX512DQ-BW-NEXT: movl $33546240, %eax # imm = 0x1FFE000
; AVX512DQ-BW-NEXT: kmovd %eax, %k1
; AVX512DQ-BW-NEXT: vmovdqu16 %zmm17, %zmm19 {%k1}
; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm17 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,32,37,42,47,52,57,62]
; AVX512DQ-BW-NEXT: vpermt2w %zmm7, %zmm17, %zmm19
; AVX512DQ-BW-NEXT: vpermi2w %zmm5, %zmm0, %zmm16
; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm4, %zmm18
; AVX512DQ-BW-NEXT: vmovdqu16 %zmm16, %zmm18 {%k1}
; AVX512DQ-BW-NEXT: vpermt2w %zmm2, %zmm17, %zmm18
; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm16 = [4,9,14,19,24,29,34,39,44,49,54,59,0,0,0,0]
; AVX512DQ-BW-NEXT: vpermt2w %zmm11, %zmm16, %zmm9
; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [20,25,30,35,40,45,50,55,60,0,0,0,0,5,10,15,20,25,30,35,40,45,50,55,60,0,0,0,0,5,10,15]
; AVX512DQ-BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vpermt2w %zmm10, %zmm11, %zmm1
; AVX512DQ-BW-NEXT: movb $7, %al
; AVX512DQ-BW-NEXT: kmovd %eax, %k1
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1}
; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,33,38,43,48,53,58,63]
; AVX512DQ-BW-NEXT: vpermt2w %zmm7, %zmm9, %zmm1
; AVX512DQ-BW-NEXT: vpermt2w %zmm5, %zmm11, %zmm0
; AVX512DQ-BW-NEXT: vpermt2w %zmm4, %zmm16, %zmm3
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1}
; AVX512DQ-BW-NEXT: vpermt2w %zmm2, %zmm9, %zmm0
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, 64(%rsi)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, (%rsi)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, 64(%rdx)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, (%rdx)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, 64(%rcx)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, (%rcx)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, 64(%r8)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, (%r8)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 64(%r9)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, (%r9)
; AVX512DQ-BW-NEXT: vzeroupper
; AVX512DQ-BW-NEXT: retq
;
; AVX512DQ-BW-FCP-LABEL: load_i16_stride5_vf64:
; AVX512DQ-BW-FCP: # %bb.0:
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm2
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm4
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm3
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm5
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm7
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm9
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm11
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm10
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [16,21,26,31,36,41,46,51,56,61,0,0,0,1,6,11,16,21,26,31,36,41,46,51,56,61,0,0,0,1,6,11]
; AVX512DQ-BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm13
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm10, %zmm12, %zmm13
; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm6 = [0,5,10,15,20,25,30,35,40,45,50,55,60,0,0,0]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm8
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm11, %zmm6, %zmm8
; AVX512DQ-BW-FCP-NEXT: movl $67100672, %eax # imm = 0x3FFE000
; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1
; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm13, %zmm8 {%k1}
; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm13 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,34,39,44,49,54,59]
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm7, %zmm13, %zmm8
; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm5, %zmm0, %zmm12
; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm3, %zmm6
; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm12, %zmm6 {%k1}
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm2, %zmm13, %zmm6
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [49,54,59,0,5,10,15,20,25,30,0,0,0,34,39,44,49,54,59,0,5,10,15,20,25,30,0,0,0,34,39,44]
; AVX512DQ-BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm15
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm1, %zmm14, %zmm15
; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm12 = [1,6,11,16,21,26,31,36,41,46,51,56,61,0,0,0]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm13
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm11, %zmm12, %zmm13
; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm15, %zmm13 {%k1}
; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm15 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,35,40,45,50,55,60]
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm7, %zmm15, %zmm13
; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm5, %zmm14
; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm4, %zmm3, %zmm12
; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm14, %zmm12 {%k1}
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm2, %zmm15, %zmm12
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [50,55,60,1,6,11,16,21,26,31,0,0,0,35,40,45,50,55,60,1,6,11,16,21,26,31,0,0,0,35,40,45]
; AVX512DQ-BW-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm17
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm1, %zmm16, %zmm17
; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [34,39,44,49,54,59,0,5,10,15,20,25,30,0,0,0]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm15
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm9, %zmm14, %zmm15
; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm17, %zmm15 {%k1}
; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm17 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,36,41,46,51,56,61]
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm7, %zmm17, %zmm15
; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm5, %zmm16
; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm4, %zmm14
; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm16, %zmm14 {%k1}
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm2, %zmm17, %zmm14
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [19,24,29,34,39,44,49,54,59,0,0,0,0,4,9,14,19,24,29,34,39,44,49,54,59,0,0,0,0,4,9,14]
; AVX512DQ-BW-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm17
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm10, %zmm16, %zmm17
; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm18 = [35,40,45,50,55,60,1,6,11,16,21,26,31,0,0,0]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm19
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm9, %zmm18, %zmm19
; AVX512DQ-BW-FCP-NEXT: movl $33546240, %eax # imm = 0x1FFE000
; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1
; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm17, %zmm19 {%k1}
; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm17 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,32,37,42,47,52,57,62]
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm7, %zmm17, %zmm19
; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm5, %zmm0, %zmm16
; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm4, %zmm18
; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm16, %zmm18 {%k1}
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm2, %zmm17, %zmm18
; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm16 = [4,9,14,19,24,29,34,39,44,49,54,59,0,0,0,0]
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm11, %zmm16, %zmm9
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [20,25,30,35,40,45,50,55,60,0,0,0,0,5,10,15,20,25,30,35,40,45,50,55,60,0,0,0,0,5,10,15]
; AVX512DQ-BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm10, %zmm11, %zmm1
; AVX512DQ-BW-FCP-NEXT: movb $7, %al
; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm1 {%k1}
; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,33,38,43,48,53,58,63]
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm7, %zmm9, %zmm1
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm5, %zmm11, %zmm0
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm4, %zmm16, %zmm3
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1}
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm2, %zmm9, %zmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, 64(%rsi)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, (%rsi)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, 64(%rdx)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, (%rdx)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, 64(%rcx)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, (%rcx)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, 64(%r8)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, (%r8)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 64(%r9)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, (%r9)
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
  %wide.vec = load <320 x i16>, ptr %in.vec, align 64
  %strided.vec0 = shufflevector <320 x i16> %wide.vec, <320 x i16> poison, <64 x i32> <i32 0, i32 5, i32 10, i32 15, i32 20, i32 25, i32 30, i32 35, i32 40, i32 45, i32 50, i32 55, i32 60, i32 65, i32 70, i32 75, i32 80, i32 85, i32 90, i32 95, i32 100, i32 105, i32 110, i32 115, i32 120, i32 125, i32 130, i32 135, i32 140, i32 145, i32 150, i32 155, i32 160, i32 165, i32 170, i32 175, i32 180, i32 185, i32 190, i32 195, i32 200, i32 205, i32 210, i32 215, i32 220, i32 225, i32 230, i32 235, i32 240, i32 245, i32 250, i32 255, i32 260, i32 265, i32 270, i32 275, i32 280, i32 285, i32 290, i32 295, i32 300, i32 305, i32 310, i32 315>
  %strided.vec1 = shufflevector <320 x i16> %wide.vec, <320 x i16> poison, <64 x i32> <i32 1, i32 6, i32 11, i32 16, i32 21, i32 26, i32 31, i32 36, i32 41, i32 46, i32 51, i32 56, i32 61, i32 66, i32 71, i32 76, i32 81, i32 86, i32 91, i32 96, i32 101, i32 106, i32 111, i32 116, i32 121, i32 126, i32 131, i32 136, i32 141, i32 146, i32 151, i32 156, i32 161, i32 166, i32 171, i32 176, i32 181, i32 186, i32 191, i32 196, i32 201, i32 206, i32 211, i32 216, i32 221, i32 226, i32 231, i32 236, i32 241, i32 246, i32 251, i32 256, i32 261, i32 266, i32 271, i32 276, i32 281, i32 286, i32 291, i32 296, i32 301, i32 306, i32 311, i32 316>
  %strided.vec2 = shufflevector <320 x i16> %wide.vec, <320 x i16> poison, <64 x i32> <i32 2, i32 7, i32 12, i32 17, i32 22, i32 27, i32 32, i32 37, i32 42, i32 47, i32 52, i32 57, i32 62, i32 67, i32 72, i32 77, i32 82, i32 87, i32 92, i32 97, i32 102, i32 107, i32 112, i32 117, i32 122, i32 127, i32 132, i32 137, i32 142, i32 147, i32 152, i32 157, i32 162, i32 167, i32 172, i32 177, i32 182, i32 187, i32 192, i32 197, i32 202, i32 207, i32 212, i32 217, i32 222, i32 227, i32 232, i32 237, i32 242, i32 247, i32 252, i32 257, i32 262, i32 267, i32 272, i32 277, i32 282, i32 287, i32 292, i32 297, i32 302, i32 307, i32 312, i32 317>
  %strided.vec3 = shufflevector <320 x i16> %wide.vec, <320 x i16> poison, <64 x i32> <i32 3, i32 8, i32 13, i32 18, i32 23, i32 28, i32 33, i32 38, i32 43, i32 48, i32 53, i32 58, i32 63, i32 68, i32 73, i32 78, i32 83, i32 88, i32 93, i32 98, i32 103, i32 108, i32 113, i32 118, i32 123, i32 128, i32 133, i32 138, i32 143, i32 148, i32 153, i32 158, i32 163, i32 168, i32 173, i32 178, i32 183, i32 188, i32 193, i32 198, i32 203, i32 208, i32 213, i32 218, i32 223, i32 228, i32 233, i32 238, i32 243, i32 248, i32 253, i32 258, i32 263, i32 268, i32 273, i32 278, i32 283, i32 288, i32 293, i32 298, i32 303, i32 308, i32 313, i32 318>
  %strided.vec4 = shufflevector <320 x i16> %wide.vec, <320 x i16> poison, <64 x i32> <i32 4, i32 9, i32 14, i32 19, i32 24, i32 29, i32 34, i32 39, i32 44, i32 49, i32 54, i32 59, i32 64, i32 69, i32 74, i32 79, i32 84, i32 89, i32 94, i32 99, i32 104, i32 109, i32 114, i32 119, i32 124, i32 129, i32 134, i32 139, i32 144, i32 149, i32 154, i32 159, i32 164, i32 169, i32 174, i32 179, i32 184, i32 189, i32 194, i32 199, i32 204, i32 209, i32 214, i32 219, i32 224, i32 229, i32 234, i32 239, i32 244, i32 249, i32 254, i32 259, i32 264, i32 269, i32 274, i32 279, i32 284, i32 289, i32 294, i32 299, i32 304, i32 309, i32 314, i32 319>
  store <64 x i16> %strided.vec0, ptr %out.vec0, align 64
  store <64 x i16> %strided.vec1, ptr %out.vec1, align 64
  store <64 x i16> %strided.vec2, ptr %out.vec2, align 64
  store <64 x i16> %strided.vec3, ptr %out.vec3, align 64
  store <64 x i16> %strided.vec4, ptr %out.vec4, align 64
  ret void
}