1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE 3; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefixes=AVX 4; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2 5; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FP 6; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FCP 7; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512 8; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512-FCP 9; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX512DQ 10; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-FCP 11; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512BW 12; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512BW-FCP 13; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX512DQ-BW 14; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-BW-FCP 15 16; These patterns are produced by LoopVectorizer for interleaved stores. 
17 18define void @store_i16_stride2_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.vec) nounwind { 19; SSE-LABEL: store_i16_stride2_vf2: 20; SSE: # %bb.0: 21; SSE-NEXT: movdqa (%rdi), %xmm0 22; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] 23; SSE-NEXT: movq %xmm0, (%rdx) 24; SSE-NEXT: retq 25; 26; AVX-LABEL: store_i16_stride2_vf2: 27; AVX: # %bb.0: 28; AVX-NEXT: vmovdqa (%rdi), %xmm0 29; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] 30; AVX-NEXT: vmovq %xmm0, (%rdx) 31; AVX-NEXT: retq 32; 33; AVX2-LABEL: store_i16_stride2_vf2: 34; AVX2: # %bb.0: 35; AVX2-NEXT: vmovdqa (%rdi), %xmm0 36; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] 37; AVX2-NEXT: vmovq %xmm0, (%rdx) 38; AVX2-NEXT: retq 39; 40; AVX2-FP-LABEL: store_i16_stride2_vf2: 41; AVX2-FP: # %bb.0: 42; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0 43; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] 44; AVX2-FP-NEXT: vmovq %xmm0, (%rdx) 45; AVX2-FP-NEXT: retq 46; 47; AVX2-FCP-LABEL: store_i16_stride2_vf2: 48; AVX2-FCP: # %bb.0: 49; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0 50; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] 51; AVX2-FCP-NEXT: vmovq %xmm0, (%rdx) 52; AVX2-FCP-NEXT: retq 53; 54; AVX512-LABEL: store_i16_stride2_vf2: 55; AVX512: # %bb.0: 56; AVX512-NEXT: vmovdqa (%rdi), %xmm0 57; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] 58; AVX512-NEXT: vmovq %xmm0, (%rdx) 59; AVX512-NEXT: retq 60; 61; AVX512-FCP-LABEL: store_i16_stride2_vf2: 62; AVX512-FCP: # %bb.0: 63; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0 64; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] 65; AVX512-FCP-NEXT: vmovq %xmm0, (%rdx) 66; AVX512-FCP-NEXT: retq 67; 68; AVX512DQ-LABEL: store_i16_stride2_vf2: 69; AVX512DQ: # %bb.0: 70; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 71; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] 72; AVX512DQ-NEXT: vmovq %xmm0, (%rdx) 73; AVX512DQ-NEXT: retq 74; 75; AVX512DQ-FCP-LABEL: store_i16_stride2_vf2: 76; AVX512DQ-FCP: # %bb.0: 77; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0 78; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] 79; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rdx) 80; AVX512DQ-FCP-NEXT: retq 81; 82; AVX512BW-LABEL: store_i16_stride2_vf2: 83; AVX512BW: # %bb.0: 84; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 85; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] 86; AVX512BW-NEXT: vmovq %xmm0, (%rdx) 87; AVX512BW-NEXT: retq 88; 89; AVX512BW-FCP-LABEL: store_i16_stride2_vf2: 90; AVX512BW-FCP: # %bb.0: 91; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 92; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] 93; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rdx) 94; AVX512BW-FCP-NEXT: retq 95; 96; AVX512DQ-BW-LABEL: store_i16_stride2_vf2: 97; AVX512DQ-BW: # %bb.0: 98; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0 99; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] 100; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rdx) 101; AVX512DQ-BW-NEXT: retq 102; 103; AVX512DQ-BW-FCP-LABEL: store_i16_stride2_vf2: 104; AVX512DQ-BW-FCP: # %bb.0: 105; AVX512DQ-BW-FCP-NEXT: vmovdqa 
(%rdi), %xmm0 106; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] 107; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rdx) 108; AVX512DQ-BW-FCP-NEXT: retq 109 %in.vec0 = load <2 x i16>, ptr %in.vecptr0, align 64 110 %in.vec1 = load <2 x i16>, ptr %in.vecptr1, align 64 111 %1 = shufflevector <2 x i16> %in.vec0, <2 x i16> %in.vec1, <4 x i32> <i32 0, i32 1, i32 2, i32 3> 112 %interleaved.vec = shufflevector <4 x i16> %1, <4 x i16> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3> 113 store <4 x i16> %interleaved.vec, ptr %out.vec, align 64 114 ret void 115} 116 117define void @store_i16_stride2_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.vec) nounwind { 118; SSE-LABEL: store_i16_stride2_vf4: 119; SSE: # %bb.0: 120; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero 121; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero 122; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 123; SSE-NEXT: movdqa %xmm1, (%rdx) 124; SSE-NEXT: retq 125; 126; AVX-LABEL: store_i16_stride2_vf4: 127; AVX: # %bb.0: 128; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 129; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 130; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 131; AVX-NEXT: vmovdqa %xmm0, (%rdx) 132; AVX-NEXT: retq 133; 134; AVX2-LABEL: store_i16_stride2_vf4: 135; AVX2: # %bb.0: 136; AVX2-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 137; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 138; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 139; AVX2-NEXT: vmovdqa %xmm0, (%rdx) 140; AVX2-NEXT: retq 141; 142; AVX2-FP-LABEL: store_i16_stride2_vf4: 143; AVX2-FP: # %bb.0: 144; AVX2-FP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 145; AVX2-FP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 146; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 147; AVX2-FP-NEXT: vmovdqa %xmm0, (%rdx) 148; AVX2-FP-NEXT: retq 149; 150; AVX2-FCP-LABEL: store_i16_stride2_vf4: 151; AVX2-FCP: # %bb.0: 152; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 153; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 154; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 155; AVX2-FCP-NEXT: vmovdqa %xmm0, (%rdx) 156; AVX2-FCP-NEXT: retq 157; 158; AVX512-LABEL: store_i16_stride2_vf4: 159; AVX512: # %bb.0: 160; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 161; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 162; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 163; AVX512-NEXT: vmovdqa %xmm0, (%rdx) 164; AVX512-NEXT: retq 165; 166; AVX512-FCP-LABEL: store_i16_stride2_vf4: 167; AVX512-FCP: # %bb.0: 168; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 169; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 170; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 171; AVX512-FCP-NEXT: vmovdqa %xmm0, (%rdx) 172; AVX512-FCP-NEXT: retq 173; 174; AVX512DQ-LABEL: store_i16_stride2_vf4: 175; AVX512DQ: # %bb.0: 176; AVX512DQ-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 177; AVX512DQ-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 178; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 179; AVX512DQ-NEXT: vmovdqa %xmm0, (%rdx) 180; AVX512DQ-NEXT: retq 181; 182; AVX512DQ-FCP-LABEL: store_i16_stride2_vf4: 183; AVX512DQ-FCP: # %bb.0: 184; 
AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 185; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 186; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 187; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, (%rdx) 188; AVX512DQ-FCP-NEXT: retq 189; 190; AVX512BW-LABEL: store_i16_stride2_vf4: 191; AVX512BW: # %bb.0: 192; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 193; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 194; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 195; AVX512BW-NEXT: vmovdqa %xmm0, (%rdx) 196; AVX512BW-NEXT: retq 197; 198; AVX512BW-FCP-LABEL: store_i16_stride2_vf4: 199; AVX512BW-FCP: # %bb.0: 200; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 201; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 202; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 203; AVX512BW-FCP-NEXT: vmovdqa %xmm0, (%rdx) 204; AVX512BW-FCP-NEXT: retq 205; 206; AVX512DQ-BW-LABEL: store_i16_stride2_vf4: 207; AVX512DQ-BW: # %bb.0: 208; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 209; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 210; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 211; AVX512DQ-BW-NEXT: vmovdqa %xmm0, (%rdx) 212; AVX512DQ-BW-NEXT: retq 213; 214; AVX512DQ-BW-FCP-LABEL: store_i16_stride2_vf4: 215; AVX512DQ-BW-FCP: # %bb.0: 216; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero 217; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero 218; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 219; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm0, (%rdx) 220; AVX512DQ-BW-FCP-NEXT: retq 221 %in.vec0 = load <4 x i16>, ptr %in.vecptr0, align 64 222 %in.vec1 = load <4 x i16>, ptr %in.vecptr1, align 64 223 %1 = shufflevector <4 x i16> %in.vec0, <4 x i16> %in.vec1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> 224 %interleaved.vec = shufflevector <8 x i16> %1, <8 x i16> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7> 225 store <8 x i16> %interleaved.vec, ptr %out.vec, align 64 226 ret void 227} 228 229define void @store_i16_stride2_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.vec) nounwind { 230; SSE-LABEL: store_i16_stride2_vf8: 231; SSE: # %bb.0: 232; SSE-NEXT: movdqa (%rdi), %xmm0 233; SSE-NEXT: movdqa (%rsi), %xmm1 234; SSE-NEXT: movdqa %xmm0, %xmm2 235; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] 236; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 237; SSE-NEXT: movdqa %xmm0, 16(%rdx) 238; SSE-NEXT: movdqa %xmm2, (%rdx) 239; SSE-NEXT: retq 240; 241; AVX-LABEL: store_i16_stride2_vf8: 242; AVX: # %bb.0: 243; AVX-NEXT: vmovdqa (%rdi), %xmm0 244; AVX-NEXT: vmovdqa (%rsi), %xmm1 245; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] 246; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] 247; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) 248; AVX-NEXT: vmovdqa %xmm2, (%rdx) 249; AVX-NEXT: retq 250; 251; AVX2-LABEL: store_i16_stride2_vf8: 252; AVX2: # %bb.0: 253; AVX2-NEXT: vmovdqa (%rdi), %xmm0 254; AVX2-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 255; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] 256; AVX2-NEXT: vpshufb {{.*#+}} 
ymm0 = ymm0[0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15,16,17,24,25,18,19,26,27,20,21,28,29,22,23,30,31] 257; AVX2-NEXT: vmovdqa %ymm0, (%rdx) 258; AVX2-NEXT: vzeroupper 259; AVX2-NEXT: retq 260; 261; AVX2-FP-LABEL: store_i16_stride2_vf8: 262; AVX2-FP: # %bb.0: 263; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0 264; AVX2-FP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 265; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] 266; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15,16,17,24,25,18,19,26,27,20,21,28,29,22,23,30,31] 267; AVX2-FP-NEXT: vmovdqa %ymm0, (%rdx) 268; AVX2-FP-NEXT: vzeroupper 269; AVX2-FP-NEXT: retq 270; 271; AVX2-FCP-LABEL: store_i16_stride2_vf8: 272; AVX2-FCP: # %bb.0: 273; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0 274; AVX2-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 275; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] 276; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15,16,17,24,25,18,19,26,27,20,21,28,29,22,23,30,31] 277; AVX2-FCP-NEXT: vmovdqa %ymm0, (%rdx) 278; AVX2-FCP-NEXT: vzeroupper 279; AVX2-FCP-NEXT: retq 280; 281; AVX512-LABEL: store_i16_stride2_vf8: 282; AVX512: # %bb.0: 283; AVX512-NEXT: vmovdqa (%rdi), %xmm0 284; AVX512-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 285; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] 286; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15,16,17,24,25,18,19,26,27,20,21,28,29,22,23,30,31] 287; AVX512-NEXT: vmovdqa %ymm0, (%rdx) 288; AVX512-NEXT: vzeroupper 289; AVX512-NEXT: retq 290; 291; AVX512-FCP-LABEL: store_i16_stride2_vf8: 292; AVX512-FCP: # %bb.0: 293; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0 294; AVX512-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 295; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] 296; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15,16,17,24,25,18,19,26,27,20,21,28,29,22,23,30,31] 297; AVX512-FCP-NEXT: vmovdqa %ymm0, (%rdx) 298; AVX512-FCP-NEXT: vzeroupper 299; AVX512-FCP-NEXT: retq 300; 301; AVX512DQ-LABEL: store_i16_stride2_vf8: 302; AVX512DQ: # %bb.0: 303; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0 304; AVX512DQ-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 305; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] 306; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15,16,17,24,25,18,19,26,27,20,21,28,29,22,23,30,31] 307; AVX512DQ-NEXT: vmovdqa %ymm0, (%rdx) 308; AVX512DQ-NEXT: vzeroupper 309; AVX512DQ-NEXT: retq 310; 311; AVX512DQ-FCP-LABEL: store_i16_stride2_vf8: 312; AVX512DQ-FCP: # %bb.0: 313; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0 314; AVX512DQ-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 315; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] 316; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15,16,17,24,25,18,19,26,27,20,21,28,29,22,23,30,31] 317; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, (%rdx) 318; AVX512DQ-FCP-NEXT: vzeroupper 319; AVX512DQ-FCP-NEXT: retq 320; 321; AVX512BW-LABEL: store_i16_stride2_vf8: 322; AVX512BW: # %bb.0: 323; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 324; AVX512BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 325; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15] 326; AVX512BW-NEXT: vpermw %ymm0, %ymm1, %ymm0 327; AVX512BW-NEXT: vmovdqa %ymm0, (%rdx) 328; AVX512BW-NEXT: vzeroupper 329; AVX512BW-NEXT: retq 330; 331; AVX512BW-FCP-LABEL: store_i16_stride2_vf8: 332; AVX512BW-FCP: # %bb.0: 333; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 334; AVX512BW-FCP-NEXT: 
vinserti128 $1, (%rsi), %ymm0, %ymm0 335; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15] 336; AVX512BW-FCP-NEXT: vpermw %ymm0, %ymm1, %ymm0 337; AVX512BW-FCP-NEXT: vmovdqa %ymm0, (%rdx) 338; AVX512BW-FCP-NEXT: vzeroupper 339; AVX512BW-FCP-NEXT: retq 340; 341; AVX512DQ-BW-LABEL: store_i16_stride2_vf8: 342; AVX512DQ-BW: # %bb.0: 343; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0 344; AVX512DQ-BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 345; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15] 346; AVX512DQ-BW-NEXT: vpermw %ymm0, %ymm1, %ymm0 347; AVX512DQ-BW-NEXT: vmovdqa %ymm0, (%rdx) 348; AVX512DQ-BW-NEXT: vzeroupper 349; AVX512DQ-BW-NEXT: retq 350; 351; AVX512DQ-BW-FCP-LABEL: store_i16_stride2_vf8: 352; AVX512DQ-BW-FCP: # %bb.0: 353; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 354; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 355; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15] 356; AVX512DQ-BW-FCP-NEXT: vpermw %ymm0, %ymm1, %ymm0 357; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm0, (%rdx) 358; AVX512DQ-BW-FCP-NEXT: vzeroupper 359; AVX512DQ-BW-FCP-NEXT: retq 360 %in.vec0 = load <8 x i16>, ptr %in.vecptr0, align 64 361 %in.vec1 = load <8 x i16>, ptr %in.vecptr1, align 64 362 %1 = shufflevector <8 x i16> %in.vec0, <8 x i16> %in.vec1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> 363 %interleaved.vec = shufflevector <16 x i16> %1, <16 x i16> poison, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15> 364 store <16 x i16> %interleaved.vec, ptr %out.vec, align 64 365 ret void 366} 367 368define void @store_i16_stride2_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.vec) nounwind { 369; SSE-LABEL: store_i16_stride2_vf16: 370; SSE: # %bb.0: 371; SSE-NEXT: movdqa (%rdi), %xmm0 372; SSE-NEXT: movdqa 16(%rdi), %xmm1 373; SSE-NEXT: movdqa (%rsi), %xmm2 374; SSE-NEXT: movdqa 16(%rsi), %xmm3 375; SSE-NEXT: movdqa %xmm0, %xmm4 376; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] 377; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] 378; SSE-NEXT: movdqa %xmm1, %xmm2 379; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] 380; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] 381; SSE-NEXT: movdqa %xmm1, 32(%rdx) 382; SSE-NEXT: movdqa %xmm2, 48(%rdx) 383; SSE-NEXT: movdqa %xmm0, (%rdx) 384; SSE-NEXT: movdqa %xmm4, 16(%rdx) 385; SSE-NEXT: retq 386; 387; AVX-LABEL: store_i16_stride2_vf16: 388; AVX: # %bb.0: 389; AVX-NEXT: vmovdqa (%rsi), %xmm0 390; AVX-NEXT: vmovdqa 16(%rsi), %xmm1 391; AVX-NEXT: vmovdqa (%rdi), %xmm2 392; AVX-NEXT: vmovdqa 16(%rdi), %xmm3 393; AVX-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] 394; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] 395; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] 396; AVX-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] 397; AVX-NEXT: vmovdqa %xmm1, 48(%rdx) 398; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) 399; AVX-NEXT: vmovdqa %xmm0, (%rdx) 400; AVX-NEXT: vmovdqa %xmm4, 16(%rdx) 401; AVX-NEXT: retq 
402; 403; AVX2-LABEL: store_i16_stride2_vf16: 404; AVX2: # %bb.0: 405; AVX2-NEXT: vmovdqa (%rdi), %ymm0 406; AVX2-NEXT: vmovdqa (%rsi), %ymm1 407; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] 408; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] 409; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[0,1],ymm2[0,1] 410; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] 411; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx) 412; AVX2-NEXT: vmovdqa %ymm1, (%rdx) 413; AVX2-NEXT: vzeroupper 414; AVX2-NEXT: retq 415; 416; AVX2-FP-LABEL: store_i16_stride2_vf16: 417; AVX2-FP: # %bb.0: 418; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm0 419; AVX2-FP-NEXT: vmovdqa (%rsi), %ymm1 420; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] 421; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] 422; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[0,1],ymm2[0,1] 423; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] 424; AVX2-FP-NEXT: vmovdqa %ymm0, 32(%rdx) 425; AVX2-FP-NEXT: vmovdqa %ymm1, (%rdx) 426; AVX2-FP-NEXT: vzeroupper 427; AVX2-FP-NEXT: retq 428; 429; AVX2-FCP-LABEL: store_i16_stride2_vf16: 430; AVX2-FCP: # %bb.0: 431; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm0 432; AVX2-FCP-NEXT: vmovdqa (%rsi), %ymm1 433; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] 434; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] 435; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm0[0,1],ymm2[0,1] 436; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3] 437; AVX2-FCP-NEXT: vmovdqa %ymm0, 32(%rdx) 438; AVX2-FCP-NEXT: vmovdqa %ymm1, (%rdx) 439; AVX2-FCP-NEXT: vzeroupper 440; AVX2-FCP-NEXT: retq 441; 442; AVX512-LABEL: store_i16_stride2_vf16: 443; AVX512: # %bb.0: 444; AVX512-NEXT: vmovdqa (%rsi), %xmm0 445; AVX512-NEXT: vmovdqa 16(%rsi), %xmm1 446; AVX512-NEXT: vmovdqa (%rdi), %xmm2 447; AVX512-NEXT: vmovdqa 16(%rdi), %xmm3 448; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] 449; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] 450; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] 451; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] 452; AVX512-NEXT: vmovdqa %xmm1, 32(%rdx) 453; AVX512-NEXT: vmovdqa %xmm2, 48(%rdx) 454; AVX512-NEXT: vmovdqa %xmm0, (%rdx) 455; AVX512-NEXT: vmovdqa %xmm4, 16(%rdx) 456; AVX512-NEXT: retq 457; 458; AVX512-FCP-LABEL: store_i16_stride2_vf16: 459; AVX512-FCP: # %bb.0: 460; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm0 461; AVX512-FCP-NEXT: vmovdqa 16(%rsi), %xmm1 462; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm2 463; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm3 464; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = 
xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] 465; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] 466; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] 467; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] 468; AVX512-FCP-NEXT: vmovdqa %xmm1, 32(%rdx) 469; AVX512-FCP-NEXT: vmovdqa %xmm2, 48(%rdx) 470; AVX512-FCP-NEXT: vmovdqa %xmm0, (%rdx) 471; AVX512-FCP-NEXT: vmovdqa %xmm4, 16(%rdx) 472; AVX512-FCP-NEXT: retq 473; 474; AVX512DQ-LABEL: store_i16_stride2_vf16: 475; AVX512DQ: # %bb.0: 476; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm0 477; AVX512DQ-NEXT: vmovdqa 16(%rsi), %xmm1 478; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm2 479; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm3 480; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] 481; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] 482; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] 483; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] 484; AVX512DQ-NEXT: vmovdqa %xmm1, 32(%rdx) 485; AVX512DQ-NEXT: vmovdqa %xmm2, 48(%rdx) 486; AVX512DQ-NEXT: vmovdqa %xmm0, (%rdx) 487; AVX512DQ-NEXT: vmovdqa %xmm4, 16(%rdx) 488; AVX512DQ-NEXT: retq 489; 490; AVX512DQ-FCP-LABEL: store_i16_stride2_vf16: 491; AVX512DQ-FCP: # %bb.0: 492; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm0 493; AVX512DQ-FCP-NEXT: vmovdqa 16(%rsi), %xmm1 494; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm2 495; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm3 496; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] 497; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] 498; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] 499; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] 500; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, 32(%rdx) 501; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, 48(%rdx) 502; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, (%rdx) 503; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, 16(%rdx) 504; AVX512DQ-FCP-NEXT: retq 505; 506; AVX512BW-LABEL: store_i16_stride2_vf16: 507; AVX512BW: # %bb.0: 508; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 509; AVX512BW-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 510; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23,8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] 511; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0 512; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) 513; AVX512BW-NEXT: vzeroupper 514; AVX512BW-NEXT: retq 515; 516; AVX512BW-FCP-LABEL: store_i16_stride2_vf16: 517; AVX512BW-FCP: # %bb.0: 518; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm0 519; AVX512BW-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 520; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23,8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] 521; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm0 522; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, (%rdx) 523; AVX512BW-FCP-NEXT: vzeroupper 524; AVX512BW-FCP-NEXT: retq 525; 526; AVX512DQ-BW-LABEL: store_i16_stride2_vf16: 527; AVX512DQ-BW: # %bb.0: 528; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm0 529; AVX512DQ-BW-NEXT: 
vinserti64x4 $1, (%rsi), %zmm0, %zmm0 530; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23,8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] 531; AVX512DQ-BW-NEXT: vpermw %zmm0, %zmm1, %zmm0 532; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, (%rdx) 533; AVX512DQ-BW-NEXT: vzeroupper 534; AVX512DQ-BW-NEXT: retq 535; 536; AVX512DQ-BW-FCP-LABEL: store_i16_stride2_vf16: 537; AVX512DQ-BW-FCP: # %bb.0: 538; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm0 539; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 540; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23,8,24,9,25,10,26,11,27,12,28,13,29,14,30,15,31] 541; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm0 542; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, (%rdx) 543; AVX512DQ-BW-FCP-NEXT: vzeroupper 544; AVX512DQ-BW-FCP-NEXT: retq 545 %in.vec0 = load <16 x i16>, ptr %in.vecptr0, align 64 546 %in.vec1 = load <16 x i16>, ptr %in.vecptr1, align 64 547 %1 = shufflevector <16 x i16> %in.vec0, <16 x i16> %in.vec1, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31> 548 %interleaved.vec = shufflevector <32 x i16> %1, <32 x i16> poison, <32 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31> 549 store <32 x i16> %interleaved.vec, ptr %out.vec, align 64 550 ret void 551} 552 553define void @store_i16_stride2_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.vec) nounwind { 554; SSE-LABEL: store_i16_stride2_vf32: 555; SSE: # %bb.0: 556; SSE-NEXT: movdqa (%rdi), %xmm0 557; SSE-NEXT: movdqa 16(%rdi), %xmm1 558; SSE-NEXT: movdqa 32(%rdi), %xmm2 559; SSE-NEXT: movdqa 48(%rdi), %xmm3 560; SSE-NEXT: movdqa (%rsi), %xmm4 561; SSE-NEXT: movdqa 16(%rsi), %xmm5 562; SSE-NEXT: movdqa 32(%rsi), %xmm6 563; SSE-NEXT: movdqa 48(%rsi), %xmm7 564; SSE-NEXT: movdqa %xmm0, %xmm8 565; SSE-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] 566; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] 567; SSE-NEXT: movdqa %xmm1, %xmm4 568; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] 569; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] 570; SSE-NEXT: movdqa %xmm2, %xmm5 571; SSE-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7] 572; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3] 573; SSE-NEXT: movdqa %xmm3, %xmm6 574; SSE-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] 575; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3] 576; SSE-NEXT: movdqa %xmm3, 96(%rdx) 577; SSE-NEXT: movdqa %xmm6, 112(%rdx) 578; SSE-NEXT: movdqa %xmm2, 64(%rdx) 579; SSE-NEXT: movdqa %xmm5, 80(%rdx) 580; SSE-NEXT: movdqa %xmm1, 32(%rdx) 581; SSE-NEXT: movdqa %xmm4, 48(%rdx) 582; SSE-NEXT: movdqa %xmm0, (%rdx) 583; SSE-NEXT: movdqa %xmm8, 16(%rdx) 584; SSE-NEXT: retq 585; 586; AVX-LABEL: store_i16_stride2_vf32: 587; AVX: # %bb.0: 588; AVX-NEXT: vmovdqa 
(%rsi), %xmm0 589; AVX-NEXT: vmovdqa 16(%rsi), %xmm1 590; AVX-NEXT: vmovdqa 32(%rsi), %xmm2 591; AVX-NEXT: vmovdqa 48(%rsi), %xmm3 592; AVX-NEXT: vmovdqa (%rdi), %xmm4 593; AVX-NEXT: vmovdqa 16(%rdi), %xmm5 594; AVX-NEXT: vmovdqa 32(%rdi), %xmm6 595; AVX-NEXT: vmovdqa 48(%rdi), %xmm7 596; AVX-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] 597; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] 598; AVX-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] 599; AVX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3] 600; AVX-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] 601; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] 602; AVX-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] 603; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] 604; AVX-NEXT: vmovdqa %xmm0, (%rdx) 605; AVX-NEXT: vmovdqa %xmm5, 16(%rdx) 606; AVX-NEXT: vmovdqa %xmm1, 32(%rdx) 607; AVX-NEXT: vmovdqa %xmm7, 48(%rdx) 608; AVX-NEXT: vmovdqa %xmm3, 96(%rdx) 609; AVX-NEXT: vmovdqa %xmm6, 112(%rdx) 610; AVX-NEXT: vmovdqa %xmm2, 64(%rdx) 611; AVX-NEXT: vmovdqa %xmm8, 80(%rdx) 612; AVX-NEXT: retq 613; 614; AVX2-LABEL: store_i16_stride2_vf32: 615; AVX2: # %bb.0: 616; AVX2-NEXT: vmovdqa (%rdi), %ymm0 617; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1 618; AVX2-NEXT: vmovdqa (%rsi), %ymm2 619; AVX2-NEXT: vmovdqa 32(%rsi), %ymm3 620; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15] 621; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11] 622; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm4[2,3] 623; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1],ymm4[0,1] 624; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15] 625; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11] 626; AVX2-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm1[2,3],ymm4[2,3] 627; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[0,1],ymm4[0,1] 628; AVX2-NEXT: vmovdqa %ymm1, 64(%rdx) 629; AVX2-NEXT: vmovdqa %ymm3, 96(%rdx) 630; AVX2-NEXT: vmovdqa %ymm0, (%rdx) 631; AVX2-NEXT: vmovdqa %ymm2, 32(%rdx) 632; AVX2-NEXT: vzeroupper 633; AVX2-NEXT: retq 634; 635; AVX2-FP-LABEL: store_i16_stride2_vf32: 636; AVX2-FP: # %bb.0: 637; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm0 638; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm1 639; AVX2-FP-NEXT: vmovdqa (%rsi), %ymm2 640; AVX2-FP-NEXT: vmovdqa 32(%rsi), %ymm3 641; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15] 642; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11] 643; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm4[2,3] 644; AVX2-FP-NEXT: vperm2i128 
{{.*#+}} ymm0 = ymm0[0,1],ymm4[0,1] 645; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15] 646; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11] 647; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm1[2,3],ymm4[2,3] 648; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[0,1],ymm4[0,1] 649; AVX2-FP-NEXT: vmovdqa %ymm1, 64(%rdx) 650; AVX2-FP-NEXT: vmovdqa %ymm3, 96(%rdx) 651; AVX2-FP-NEXT: vmovdqa %ymm0, (%rdx) 652; AVX2-FP-NEXT: vmovdqa %ymm2, 32(%rdx) 653; AVX2-FP-NEXT: vzeroupper 654; AVX2-FP-NEXT: retq 655; 656; AVX2-FCP-LABEL: store_i16_stride2_vf32: 657; AVX2-FCP: # %bb.0: 658; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm0 659; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 660; AVX2-FCP-NEXT: vmovdqa (%rsi), %ymm2 661; AVX2-FCP-NEXT: vmovdqa 32(%rsi), %ymm3 662; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15] 663; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11] 664; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3],ymm4[2,3] 665; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1],ymm4[0,1] 666; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15] 667; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11] 668; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm1[2,3],ymm4[2,3] 669; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[0,1],ymm4[0,1] 670; AVX2-FCP-NEXT: vmovdqa %ymm1, 64(%rdx) 671; AVX2-FCP-NEXT: vmovdqa %ymm3, 96(%rdx) 672; AVX2-FCP-NEXT: vmovdqa %ymm0, (%rdx) 673; AVX2-FCP-NEXT: vmovdqa %ymm2, 32(%rdx) 674; AVX2-FCP-NEXT: vzeroupper 675; AVX2-FCP-NEXT: retq 676; 677; AVX512-LABEL: store_i16_stride2_vf32: 678; AVX512: # %bb.0: 679; AVX512-NEXT: vmovdqa (%rsi), %xmm0 680; AVX512-NEXT: vmovdqa 16(%rsi), %xmm1 681; AVX512-NEXT: vmovdqa 32(%rsi), %xmm2 682; AVX512-NEXT: vmovdqa 48(%rsi), %xmm3 683; AVX512-NEXT: vmovdqa (%rdi), %xmm4 684; AVX512-NEXT: vmovdqa 16(%rdi), %xmm5 685; AVX512-NEXT: vmovdqa 32(%rdi), %xmm6 686; AVX512-NEXT: vmovdqa 48(%rdi), %xmm7 687; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] 688; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] 689; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] 690; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] 691; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] 692; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] 693; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] 694; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3] 695; AVX512-NEXT: vmovdqa %xmm3, 96(%rdx) 696; AVX512-NEXT: 
vmovdqa %xmm6, 112(%rdx) 697; AVX512-NEXT: vmovdqa %xmm2, 64(%rdx) 698; AVX512-NEXT: vmovdqa %xmm5, 80(%rdx) 699; AVX512-NEXT: vmovdqa %xmm1, 32(%rdx) 700; AVX512-NEXT: vmovdqa %xmm4, 48(%rdx) 701; AVX512-NEXT: vmovdqa %xmm0, (%rdx) 702; AVX512-NEXT: vmovdqa %xmm8, 16(%rdx) 703; AVX512-NEXT: retq 704; 705; AVX512-FCP-LABEL: store_i16_stride2_vf32: 706; AVX512-FCP: # %bb.0: 707; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm0 708; AVX512-FCP-NEXT: vmovdqa 16(%rsi), %xmm1 709; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %xmm2 710; AVX512-FCP-NEXT: vmovdqa 48(%rsi), %xmm3 711; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm4 712; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm5 713; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm6 714; AVX512-FCP-NEXT: vmovdqa 48(%rdi), %xmm7 715; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] 716; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] 717; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] 718; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] 719; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] 720; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] 721; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] 722; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3] 723; AVX512-FCP-NEXT: vmovdqa %xmm3, 96(%rdx) 724; AVX512-FCP-NEXT: vmovdqa %xmm6, 112(%rdx) 725; AVX512-FCP-NEXT: vmovdqa %xmm2, 64(%rdx) 726; AVX512-FCP-NEXT: vmovdqa %xmm5, 80(%rdx) 727; AVX512-FCP-NEXT: vmovdqa %xmm1, 32(%rdx) 728; AVX512-FCP-NEXT: vmovdqa %xmm4, 48(%rdx) 729; AVX512-FCP-NEXT: vmovdqa %xmm0, (%rdx) 730; AVX512-FCP-NEXT: vmovdqa %xmm8, 16(%rdx) 731; AVX512-FCP-NEXT: retq 732; 733; AVX512DQ-LABEL: store_i16_stride2_vf32: 734; AVX512DQ: # %bb.0: 735; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm0 736; AVX512DQ-NEXT: vmovdqa 16(%rsi), %xmm1 737; AVX512DQ-NEXT: vmovdqa 32(%rsi), %xmm2 738; AVX512DQ-NEXT: vmovdqa 48(%rsi), %xmm3 739; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm4 740; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm5 741; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm6 742; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm7 743; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] 744; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] 745; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] 746; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] 747; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] 748; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] 749; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] 750; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3] 751; AVX512DQ-NEXT: vmovdqa %xmm3, 96(%rdx) 752; AVX512DQ-NEXT: vmovdqa %xmm6, 112(%rdx) 753; AVX512DQ-NEXT: vmovdqa %xmm2, 64(%rdx) 754; AVX512DQ-NEXT: vmovdqa %xmm5, 80(%rdx) 755; AVX512DQ-NEXT: vmovdqa 
%xmm1, 32(%rdx) 756; AVX512DQ-NEXT: vmovdqa %xmm4, 48(%rdx) 757; AVX512DQ-NEXT: vmovdqa %xmm0, (%rdx) 758; AVX512DQ-NEXT: vmovdqa %xmm8, 16(%rdx) 759; AVX512DQ-NEXT: retq 760; 761; AVX512DQ-FCP-LABEL: store_i16_stride2_vf32: 762; AVX512DQ-FCP: # %bb.0: 763; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm0 764; AVX512DQ-FCP-NEXT: vmovdqa 16(%rsi), %xmm1 765; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %xmm2 766; AVX512DQ-FCP-NEXT: vmovdqa 48(%rsi), %xmm3 767; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm4 768; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm5 769; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm6 770; AVX512DQ-FCP-NEXT: vmovdqa 48(%rdi), %xmm7 771; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] 772; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] 773; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] 774; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] 775; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] 776; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] 777; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] 778; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3] 779; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, 96(%rdx) 780; AVX512DQ-FCP-NEXT: vmovdqa %xmm6, 112(%rdx) 781; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, 64(%rdx) 782; AVX512DQ-FCP-NEXT: vmovdqa %xmm5, 80(%rdx) 783; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, 32(%rdx) 784; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, 48(%rdx) 785; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, (%rdx) 786; AVX512DQ-FCP-NEXT: vmovdqa %xmm8, 16(%rdx) 787; AVX512DQ-FCP-NEXT: retq 788; 789; AVX512BW-LABEL: store_i16_stride2_vf32: 790; AVX512BW: # %bb.0: 791; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 792; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm1 793; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,32,1,33,2,34,3,35,4,36,5,37,6,38,7,39,8,40,9,41,10,42,11,43,12,44,13,45,14,46,15,47] 794; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 795; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm3 = [16,48,17,49,18,50,19,51,20,52,21,53,22,54,23,55,24,56,25,57,26,58,27,59,28,60,29,61,30,62,31,63] 796; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 797; AVX512BW-NEXT: vmovdqa64 %zmm3, 64(%rdx) 798; AVX512BW-NEXT: vmovdqa64 %zmm2, (%rdx) 799; AVX512BW-NEXT: vzeroupper 800; AVX512BW-NEXT: retq 801; 802; AVX512BW-FCP-LABEL: store_i16_stride2_vf32: 803; AVX512BW-FCP: # %bb.0: 804; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 805; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm1 806; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,32,1,33,2,34,3,35,4,36,5,37,6,38,7,39,8,40,9,41,10,42,11,43,12,44,13,45,14,46,15,47] 807; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 808; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [16,48,17,49,18,50,19,51,20,52,21,53,22,54,23,55,24,56,25,57,26,58,27,59,28,60,29,61,30,62,31,63] 809; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 810; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, 64(%rdx) 811; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, (%rdx) 812; AVX512BW-FCP-NEXT: vzeroupper 813; AVX512BW-FCP-NEXT: retq 814; 815; AVX512DQ-BW-LABEL: store_i16_stride2_vf32: 816; AVX512DQ-BW: # %bb.0: 817; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0 818; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %zmm1 
819; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,32,1,33,2,34,3,35,4,36,5,37,6,38,7,39,8,40,9,41,10,42,11,43,12,44,13,45,14,46,15,47] 820; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 821; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm3 = [16,48,17,49,18,50,19,51,20,52,21,53,22,54,23,55,24,56,25,57,26,58,27,59,28,60,29,61,30,62,31,63] 822; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 823; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, 64(%rdx) 824; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, (%rdx) 825; AVX512DQ-BW-NEXT: vzeroupper 826; AVX512DQ-BW-NEXT: retq 827; 828; AVX512DQ-BW-FCP-LABEL: store_i16_stride2_vf32: 829; AVX512DQ-BW-FCP: # %bb.0: 830; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 831; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm1 832; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [0,32,1,33,2,34,3,35,4,36,5,37,6,38,7,39,8,40,9,41,10,42,11,43,12,44,13,45,14,46,15,47] 833; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 834; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm3 = [16,48,17,49,18,50,19,51,20,52,21,53,22,54,23,55,24,56,25,57,26,58,27,59,28,60,29,61,30,62,31,63] 835; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 836; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, 64(%rdx) 837; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, (%rdx) 838; AVX512DQ-BW-FCP-NEXT: vzeroupper 839; AVX512DQ-BW-FCP-NEXT: retq 840 %in.vec0 = load <32 x i16>, ptr %in.vecptr0, align 64 841 %in.vec1 = load <32 x i16>, ptr %in.vecptr1, align 64 842 %1 = shufflevector <32 x i16> %in.vec0, <32 x i16> %in.vec1, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63> 843 %interleaved.vec = shufflevector <64 x i16> %1, <64 x i16> poison, <64 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63> 844 store <64 x i16> %interleaved.vec, ptr %out.vec, align 64 845 ret void 846} 847 848define void @store_i16_stride2_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %out.vec) nounwind { 849; SSE-LABEL: store_i16_stride2_vf64: 850; SSE: # %bb.0: 851; SSE-NEXT: movdqa 112(%rdi), %xmm0 852; SSE-NEXT: movdqa 96(%rdi), %xmm6 853; SSE-NEXT: movdqa 80(%rdi), %xmm4 854; SSE-NEXT: movdqa 64(%rdi), %xmm3 855; SSE-NEXT: movdqa (%rdi), %xmm8 856; SSE-NEXT: movdqa 16(%rdi), %xmm1 857; SSE-NEXT: movdqa 32(%rdi), %xmm2 858; SSE-NEXT: movdqa 48(%rdi), %xmm5 859; SSE-NEXT: movdqa 96(%rsi), %xmm11 860; SSE-NEXT: movdqa 80(%rsi), %xmm12 861; SSE-NEXT: movdqa 64(%rsi), %xmm13 862; SSE-NEXT: movdqa (%rsi), %xmm9 863; SSE-NEXT: movdqa 16(%rsi), %xmm10 864; SSE-NEXT: movdqa 32(%rsi), %xmm14 865; SSE-NEXT: movdqa 48(%rsi), %xmm15 866; SSE-NEXT: movdqa %xmm8, %xmm7 867; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7] 868; 
SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 869; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] 870; SSE-NEXT: movdqa %xmm1, %xmm9 871; SSE-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] 872; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1],xmm1[2],xmm10[2],xmm1[3],xmm10[3] 873; SSE-NEXT: movdqa %xmm2, %xmm10 874; SSE-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm14[4],xmm10[5],xmm14[5],xmm10[6],xmm14[6],xmm10[7],xmm14[7] 875; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3] 876; SSE-NEXT: movdqa %xmm5, %xmm14 877; SSE-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm15[4],xmm14[5],xmm15[5],xmm14[6],xmm15[6],xmm14[7],xmm15[7] 878; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm15[0],xmm5[1],xmm15[1],xmm5[2],xmm15[2],xmm5[3],xmm15[3] 879; SSE-NEXT: movdqa %xmm3, %xmm15 880; SSE-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm13[4],xmm15[5],xmm13[5],xmm15[6],xmm13[6],xmm15[7],xmm13[7] 881; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm13[0],xmm3[1],xmm13[1],xmm3[2],xmm13[2],xmm3[3],xmm13[3] 882; SSE-NEXT: movdqa %xmm4, %xmm13 883; SSE-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] 884; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1],xmm4[2],xmm12[2],xmm4[3],xmm12[3] 885; SSE-NEXT: movdqa %xmm6, %xmm12 886; SSE-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] 887; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3] 888; SSE-NEXT: movdqa 112(%rsi), %xmm11 889; SSE-NEXT: movdqa %xmm0, %xmm7 890; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm11[4],xmm7[5],xmm11[5],xmm7[6],xmm11[6],xmm7[7],xmm11[7] 891; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3] 892; SSE-NEXT: movdqa %xmm0, 224(%rdx) 893; SSE-NEXT: movdqa %xmm7, 240(%rdx) 894; SSE-NEXT: movdqa %xmm6, 192(%rdx) 895; SSE-NEXT: movdqa %xmm12, 208(%rdx) 896; SSE-NEXT: movdqa %xmm4, 160(%rdx) 897; SSE-NEXT: movdqa %xmm13, 176(%rdx) 898; SSE-NEXT: movdqa %xmm3, 128(%rdx) 899; SSE-NEXT: movdqa %xmm15, 144(%rdx) 900; SSE-NEXT: movdqa %xmm5, 96(%rdx) 901; SSE-NEXT: movdqa %xmm14, 112(%rdx) 902; SSE-NEXT: movdqa %xmm2, 64(%rdx) 903; SSE-NEXT: movdqa %xmm10, 80(%rdx) 904; SSE-NEXT: movdqa %xmm1, 32(%rdx) 905; SSE-NEXT: movdqa %xmm9, 48(%rdx) 906; SSE-NEXT: movdqa %xmm8, (%rdx) 907; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 908; SSE-NEXT: movaps %xmm0, 16(%rdx) 909; SSE-NEXT: retq 910; 911; AVX-LABEL: store_i16_stride2_vf64: 912; AVX: # %bb.0: 913; AVX-NEXT: vmovdqa 64(%rsi), %xmm1 914; AVX-NEXT: vmovdqa 64(%rdi), %xmm2 915; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] 916; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 917; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] 918; AVX-NEXT: vmovdqa 80(%rsi), %xmm3 919; AVX-NEXT: vmovdqa 80(%rdi), %xmm4 920; AVX-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] 921; AVX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] 922; AVX-NEXT: vmovdqa (%rsi), %xmm4 923; AVX-NEXT: vmovdqa 16(%rsi), %xmm5 924; AVX-NEXT: vmovdqa 32(%rsi), %xmm6 
; AVX-NEXT: vmovdqa 48(%rsi), %xmm7
; AVX-NEXT: vmovdqa (%rdi), %xmm8
; AVX-NEXT: vmovdqa 16(%rdi), %xmm9
; AVX-NEXT: vmovdqa 32(%rdi), %xmm10
; AVX-NEXT: vmovdqa 48(%rdi), %xmm11
; AVX-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3]
; AVX-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm10[4],xmm6[4],xmm10[5],xmm6[5],xmm10[6],xmm6[6],xmm10[7],xmm6[7]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm10[0],xmm6[0],xmm10[1],xmm6[1],xmm10[2],xmm6[2],xmm10[3],xmm6[3]
; AVX-NEXT: vmovdqa 96(%rsi), %xmm10
; AVX-NEXT: vmovdqa 96(%rdi), %xmm13
; AVX-NEXT: vpunpckhwd {{.*#+}} xmm14 = xmm13[4],xmm10[4],xmm13[5],xmm10[5],xmm13[6],xmm10[6],xmm13[7],xmm10[7]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3]
; AVX-NEXT: vpunpckhwd {{.*#+}} xmm13 = xmm11[4],xmm7[4],xmm11[5],xmm7[5],xmm11[6],xmm7[6],xmm11[7],xmm7[7]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm11[0],xmm7[0],xmm11[1],xmm7[1],xmm11[2],xmm7[2],xmm11[3],xmm7[3]
; AVX-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm9[4],xmm5[4],xmm9[5],xmm5[5],xmm9[6],xmm5[6],xmm9[7],xmm5[7]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3]
; AVX-NEXT: vmovdqa 112(%rsi), %xmm9
; AVX-NEXT: vmovdqa 112(%rdi), %xmm15
; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm15[4],xmm9[4],xmm15[5],xmm9[5],xmm15[6],xmm9[6],xmm15[7],xmm9[7]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3]
; AVX-NEXT: vmovdqa %xmm9, 224(%rdx)
; AVX-NEXT: vmovdqa %xmm0, 240(%rdx)
; AVX-NEXT: vmovdqa %xmm5, 32(%rdx)
; AVX-NEXT: vmovdqa %xmm11, 48(%rdx)
; AVX-NEXT: vmovdqa %xmm7, 96(%rdx)
; AVX-NEXT: vmovdqa %xmm13, 112(%rdx)
; AVX-NEXT: vmovdqa %xmm10, 192(%rdx)
; AVX-NEXT: vmovdqa %xmm14, 208(%rdx)
; AVX-NEXT: vmovdqa %xmm6, 64(%rdx)
; AVX-NEXT: vmovdqa %xmm8, 80(%rdx)
; AVX-NEXT: vmovdqa %xmm4, (%rdx)
; AVX-NEXT: vmovdqa %xmm12, 16(%rdx)
; AVX-NEXT: vmovdqa %xmm3, 160(%rdx)
; AVX-NEXT: vmovdqa %xmm2, 176(%rdx)
; AVX-NEXT: vmovdqa %xmm1, 128(%rdx)
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: vmovaps %xmm0, 144(%rdx)
; AVX-NEXT: retq
;
; AVX2-LABEL: store_i16_stride2_vf64:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX2-NEXT: vmovdqa 64(%rdi), %ymm2
; AVX2-NEXT: vmovdqa 96(%rdi), %ymm3
; AVX2-NEXT: vmovdqa (%rsi), %ymm4
; AVX2-NEXT: vmovdqa 32(%rsi), %ymm5
; AVX2-NEXT: vmovdqa 64(%rsi), %ymm6
; AVX2-NEXT: vmovdqa 96(%rsi), %ymm7
; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm0[4],ymm4[4],ymm0[5],ymm4[5],ymm0[6],ymm4[6],ymm0[7],ymm4[7],ymm0[12],ymm4[12],ymm0[13],ymm4[13],ymm0[14],ymm4[14],ymm0[15],ymm4[15]
; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[1],ymm4[1],ymm0[2],ymm4[2],ymm0[3],ymm4[3],ymm0[8],ymm4[8],ymm0[9],ymm4[9],ymm0[10],ymm4[10],ymm0[11],ymm4[11]
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm0[2,3],ymm8[2,3]
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1],ymm8[0,1]
; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm1[4],ymm5[4],ymm1[5],ymm5[5],ymm1[6],ymm5[6],ymm1[7],ymm5[7],ymm1[12],ymm5[12],ymm1[13],ymm5[13],ymm1[14],ymm5[14],ymm1[15],ymm5[15]
; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[1],ymm5[1],ymm1[2],ymm5[2],ymm1[3],ymm5[3],ymm1[8],ymm5[8],ymm1[9],ymm5[9],ymm1[10],ymm5[10],ymm1[11],ymm5[11]
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm1[2,3],ymm8[2,3]
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[0,1],ymm8[0,1]
; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm2[4],ymm6[4],ymm2[5],ymm6[5],ymm2[6],ymm6[6],ymm2[7],ymm6[7],ymm2[12],ymm6[12],ymm2[13],ymm6[13],ymm2[14],ymm6[14],ymm2[15],ymm6[15]
; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm6[0],ymm2[1],ymm6[1],ymm2[2],ymm6[2],ymm2[3],ymm6[3],ymm2[8],ymm6[8],ymm2[9],ymm6[9],ymm2[10],ymm6[10],ymm2[11],ymm6[11]
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm2[2,3],ymm8[2,3]
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm2[0,1],ymm8[0,1]
; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm3[4],ymm7[4],ymm3[5],ymm7[5],ymm3[6],ymm7[6],ymm3[7],ymm7[7],ymm3[12],ymm7[12],ymm3[13],ymm7[13],ymm3[14],ymm7[14],ymm3[15],ymm7[15]
; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm3[0],ymm7[0],ymm3[1],ymm7[1],ymm3[2],ymm7[2],ymm3[3],ymm7[3],ymm3[8],ymm7[8],ymm3[9],ymm7[9],ymm3[10],ymm7[10],ymm3[11],ymm7[11]
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm3[2,3],ymm8[2,3]
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[0,1],ymm8[0,1]
; AVX2-NEXT: vmovdqa %ymm3, 192(%rdx)
; AVX2-NEXT: vmovdqa %ymm7, 224(%rdx)
; AVX2-NEXT: vmovdqa %ymm2, 128(%rdx)
; AVX2-NEXT: vmovdqa %ymm6, 160(%rdx)
; AVX2-NEXT: vmovdqa %ymm1, 64(%rdx)
; AVX2-NEXT: vmovdqa %ymm5, 96(%rdx)
; AVX2-NEXT: vmovdqa %ymm0, (%rdx)
; AVX2-NEXT: vmovdqa %ymm4, 32(%rdx)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX2-FP-LABEL: store_i16_stride2_vf64:
; AVX2-FP: # %bb.0:
; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX2-FP-NEXT: vmovdqa 64(%rdi), %ymm2
; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm3
; AVX2-FP-NEXT: vmovdqa (%rsi), %ymm4
; AVX2-FP-NEXT: vmovdqa 32(%rsi), %ymm5
; AVX2-FP-NEXT: vmovdqa 64(%rsi), %ymm6
; AVX2-FP-NEXT: vmovdqa 96(%rsi), %ymm7
; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm0[4],ymm4[4],ymm0[5],ymm4[5],ymm0[6],ymm4[6],ymm0[7],ymm4[7],ymm0[12],ymm4[12],ymm0[13],ymm4[13],ymm0[14],ymm4[14],ymm0[15],ymm4[15]
; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[1],ymm4[1],ymm0[2],ymm4[2],ymm0[3],ymm4[3],ymm0[8],ymm4[8],ymm0[9],ymm4[9],ymm0[10],ymm4[10],ymm0[11],ymm4[11]
; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm0[2,3],ymm8[2,3]
; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1],ymm8[0,1]
; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm1[4],ymm5[4],ymm1[5],ymm5[5],ymm1[6],ymm5[6],ymm1[7],ymm5[7],ymm1[12],ymm5[12],ymm1[13],ymm5[13],ymm1[14],ymm5[14],ymm1[15],ymm5[15]
; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[1],ymm5[1],ymm1[2],ymm5[2],ymm1[3],ymm5[3],ymm1[8],ymm5[8],ymm1[9],ymm5[9],ymm1[10],ymm5[10],ymm1[11],ymm5[11]
; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm1[2,3],ymm8[2,3]
; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[0,1],ymm8[0,1]
; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm2[4],ymm6[4],ymm2[5],ymm6[5],ymm2[6],ymm6[6],ymm2[7],ymm6[7],ymm2[12],ymm6[12],ymm2[13],ymm6[13],ymm2[14],ymm6[14],ymm2[15],ymm6[15]
; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm6[0],ymm2[1],ymm6[1],ymm2[2],ymm6[2],ymm2[3],ymm6[3],ymm2[8],ymm6[8],ymm2[9],ymm6[9],ymm2[10],ymm6[10],ymm2[11],ymm6[11]
; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm2[2,3],ymm8[2,3]
; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm2[0,1],ymm8[0,1]
; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm3[4],ymm7[4],ymm3[5],ymm7[5],ymm3[6],ymm7[6],ymm3[7],ymm7[7],ymm3[12],ymm7[12],ymm3[13],ymm7[13],ymm3[14],ymm7[14],ymm3[15],ymm7[15]
; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm3[0],ymm7[0],ymm3[1],ymm7[1],ymm3[2],ymm7[2],ymm3[3],ymm7[3],ymm3[8],ymm7[8],ymm3[9],ymm7[9],ymm3[10],ymm7[10],ymm3[11],ymm7[11]
; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm3[2,3],ymm8[2,3]
; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[0,1],ymm8[0,1]
; AVX2-FP-NEXT: vmovdqa %ymm3, 192(%rdx)
; AVX2-FP-NEXT: vmovdqa %ymm7, 224(%rdx)
; AVX2-FP-NEXT: vmovdqa %ymm2, 128(%rdx)
; AVX2-FP-NEXT: vmovdqa %ymm6, 160(%rdx)
; AVX2-FP-NEXT: vmovdqa %ymm1, 64(%rdx)
; AVX2-FP-NEXT: vmovdqa %ymm5, 96(%rdx)
; AVX2-FP-NEXT: vmovdqa %ymm0, (%rdx)
; AVX2-FP-NEXT: vmovdqa %ymm4, 32(%rdx)
; AVX2-FP-NEXT: vzeroupper
; AVX2-FP-NEXT: retq
;
; AVX2-FCP-LABEL: store_i16_stride2_vf64:
; AVX2-FCP: # %bb.0:
; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %ymm1
; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm2
; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm3
; AVX2-FCP-NEXT: vmovdqa (%rsi), %ymm4
; AVX2-FCP-NEXT: vmovdqa 32(%rsi), %ymm5
; AVX2-FCP-NEXT: vmovdqa 64(%rsi), %ymm6
; AVX2-FCP-NEXT: vmovdqa 96(%rsi), %ymm7
; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm0[4],ymm4[4],ymm0[5],ymm4[5],ymm0[6],ymm4[6],ymm0[7],ymm4[7],ymm0[12],ymm4[12],ymm0[13],ymm4[13],ymm0[14],ymm4[14],ymm0[15],ymm4[15]
; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[1],ymm4[1],ymm0[2],ymm4[2],ymm0[3],ymm4[3],ymm0[8],ymm4[8],ymm0[9],ymm4[9],ymm0[10],ymm4[10],ymm0[11],ymm4[11]
; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm0[2,3],ymm8[2,3]
; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1],ymm8[0,1]
; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm1[4],ymm5[4],ymm1[5],ymm5[5],ymm1[6],ymm5[6],ymm1[7],ymm5[7],ymm1[12],ymm5[12],ymm1[13],ymm5[13],ymm1[14],ymm5[14],ymm1[15],ymm5[15]
; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[1],ymm5[1],ymm1[2],ymm5[2],ymm1[3],ymm5[3],ymm1[8],ymm5[8],ymm1[9],ymm5[9],ymm1[10],ymm5[10],ymm1[11],ymm5[11]
; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm1[2,3],ymm8[2,3]
; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[0,1],ymm8[0,1]
; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm2[4],ymm6[4],ymm2[5],ymm6[5],ymm2[6],ymm6[6],ymm2[7],ymm6[7],ymm2[12],ymm6[12],ymm2[13],ymm6[13],ymm2[14],ymm6[14],ymm2[15],ymm6[15]
; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm6[0],ymm2[1],ymm6[1],ymm2[2],ymm6[2],ymm2[3],ymm6[3],ymm2[8],ymm6[8],ymm2[9],ymm6[9],ymm2[10],ymm6[10],ymm2[11],ymm6[11]
; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm2[2,3],ymm8[2,3]
; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm2[0,1],ymm8[0,1]
; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm3[4],ymm7[4],ymm3[5],ymm7[5],ymm3[6],ymm7[6],ymm3[7],ymm7[7],ymm3[12],ymm7[12],ymm3[13],ymm7[13],ymm3[14],ymm7[14],ymm3[15],ymm7[15]
; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm3[0],ymm7[0],ymm3[1],ymm7[1],ymm3[2],ymm7[2],ymm3[3],ymm7[3],ymm3[8],ymm7[8],ymm3[9],ymm7[9],ymm3[10],ymm7[10],ymm3[11],ymm7[11]
; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm7 = ymm3[2,3],ymm8[2,3]
; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[0,1],ymm8[0,1]
; AVX2-FCP-NEXT: vmovdqa %ymm3, 192(%rdx)
; AVX2-FCP-NEXT: vmovdqa %ymm7, 224(%rdx)
; AVX2-FCP-NEXT: vmovdqa %ymm2, 128(%rdx)
; AVX2-FCP-NEXT: vmovdqa %ymm6, 160(%rdx)
; AVX2-FCP-NEXT: vmovdqa %ymm1, 64(%rdx)
; AVX2-FCP-NEXT: vmovdqa %ymm5, 96(%rdx)
; AVX2-FCP-NEXT: vmovdqa %ymm0, (%rdx)
; AVX2-FCP-NEXT: vmovdqa %ymm4, 32(%rdx)
; AVX2-FCP-NEXT: vzeroupper
; AVX2-FCP-NEXT: retq
;
; AVX512-LABEL: store_i16_stride2_vf64:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa 64(%rsi), %xmm1
; AVX512-NEXT: vmovdqa 64(%rdi), %xmm2
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; AVX512-NEXT: vmovdqa64 %xmm0, %xmm16
; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
; AVX512-NEXT: vmovdqa 80(%rsi), %xmm3
; AVX512-NEXT: vmovdqa 80(%rdi), %xmm4
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
; AVX512-NEXT: vmovdqa 96(%rsi), %xmm5
; AVX512-NEXT: vmovdqa 96(%rdi), %xmm6
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
; AVX512-NEXT: vmovdqa 112(%rsi), %xmm6
; AVX512-NEXT: vmovdqa 112(%rdi), %xmm7
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
; AVX512-NEXT: vmovdqa (%rsi), %xmm7
; AVX512-NEXT: vmovdqa 16(%rsi), %xmm9
; AVX512-NEXT: vmovdqa 32(%rsi), %xmm10
; AVX512-NEXT: vmovdqa 48(%rsi), %xmm11
; AVX512-NEXT: vmovdqa (%rdi), %xmm12
; AVX512-NEXT: vmovdqa 32(%rdi), %xmm13
; AVX512-NEXT: vmovdqa 48(%rdi), %xmm14
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3]
; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm13[4],xmm10[4],xmm13[5],xmm10[5],xmm13[6],xmm10[6],xmm13[7],xmm10[7]
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3]
; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm14[4],xmm11[4],xmm14[5],xmm11[5],xmm14[6],xmm11[6],xmm14[7],xmm11[7]
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3]
; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm12[4],xmm7[4],xmm12[5],xmm7[5],xmm12[6],xmm7[6],xmm12[7],xmm7[7]
; AVX512-NEXT: vmovdqa 16(%rdi), %xmm12
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3]
; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm12[4],xmm9[4],xmm12[5],xmm9[5],xmm12[6],xmm9[6],xmm12[7],xmm9[7]
; AVX512-NEXT: vmovdqa %xmm9, 48(%rdx)
; AVX512-NEXT: vmovdqa %xmm0, 32(%rdx)
; AVX512-NEXT: vmovdqa %xmm7, 16(%rdx)
; AVX512-NEXT: vmovdqa %xmm14, (%rdx)
; AVX512-NEXT: vmovdqa %xmm11, 112(%rdx)
; AVX512-NEXT: vmovdqa %xmm13, 96(%rdx)
; AVX512-NEXT: vmovdqa %xmm10, 80(%rdx)
; AVX512-NEXT: vmovdqa %xmm15, 64(%rdx)
; AVX512-NEXT: vmovdqa %xmm6, 240(%rdx)
; AVX512-NEXT: vmovdqa %xmm8, 224(%rdx)
; AVX512-NEXT: vmovdqa %xmm5, 208(%rdx)
; AVX512-NEXT: vmovdqa %xmm4, 192(%rdx)
; AVX512-NEXT: vmovdqa %xmm3, 176(%rdx)
; AVX512-NEXT: vmovdqa %xmm2, 160(%rdx)
; AVX512-NEXT: vmovdqa %xmm1, 144(%rdx)
; AVX512-NEXT: vmovdqa64 %xmm16, 128(%rdx)
; AVX512-NEXT: retq
;
; AVX512-FCP-LABEL: store_i16_stride2_vf64:
; AVX512-FCP: # %bb.0:
; AVX512-FCP-NEXT: vmovdqa 64(%rsi), %xmm1
; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm2
; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; AVX512-FCP-NEXT: vmovdqa64 %xmm0, %xmm16
; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
; AVX512-FCP-NEXT: vmovdqa 80(%rsi), %xmm3
; AVX512-FCP-NEXT: vmovdqa 80(%rdi), %xmm4
; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
; AVX512-FCP-NEXT: vmovdqa 96(%rsi), %xmm5
; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %xmm6
; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
; AVX512-FCP-NEXT: vmovdqa 112(%rsi), %xmm6
; AVX512-FCP-NEXT: vmovdqa 112(%rdi), %xmm7
; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm7
; AVX512-FCP-NEXT: vmovdqa 16(%rsi), %xmm9
; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %xmm10
; AVX512-FCP-NEXT: vmovdqa 48(%rsi), %xmm11
; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm12
; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm13
; AVX512-FCP-NEXT: vmovdqa 48(%rdi), %xmm14
; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3]
; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm13[4],xmm10[4],xmm13[5],xmm10[5],xmm13[6],xmm10[6],xmm13[7],xmm10[7]
; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3]
; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm14[4],xmm11[4],xmm14[5],xmm11[5],xmm14[6],xmm11[6],xmm14[7],xmm11[7]
; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3]
; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm12[4],xmm7[4],xmm12[5],xmm7[5],xmm12[6],xmm7[6],xmm12[7],xmm7[7]
; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm12
; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3]
; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm12[4],xmm9[4],xmm12[5],xmm9[5],xmm12[6],xmm9[6],xmm12[7],xmm9[7]
; AVX512-FCP-NEXT: vmovdqa %xmm9, 48(%rdx)
; AVX512-FCP-NEXT: vmovdqa %xmm0, 32(%rdx)
; AVX512-FCP-NEXT: vmovdqa %xmm7, 16(%rdx)
; AVX512-FCP-NEXT: vmovdqa %xmm14, (%rdx)
; AVX512-FCP-NEXT: vmovdqa %xmm11, 112(%rdx)
; AVX512-FCP-NEXT: vmovdqa %xmm13, 96(%rdx)
; AVX512-FCP-NEXT: vmovdqa %xmm10, 80(%rdx)
; AVX512-FCP-NEXT: vmovdqa %xmm15, 64(%rdx)
; AVX512-FCP-NEXT: vmovdqa %xmm6, 240(%rdx)
; AVX512-FCP-NEXT: vmovdqa %xmm8, 224(%rdx)
; AVX512-FCP-NEXT: vmovdqa %xmm5, 208(%rdx)
; AVX512-FCP-NEXT: vmovdqa %xmm4, 192(%rdx)
; AVX512-FCP-NEXT: vmovdqa %xmm3, 176(%rdx)
; AVX512-FCP-NEXT: vmovdqa %xmm2, 160(%rdx)
; AVX512-FCP-NEXT: vmovdqa %xmm1, 144(%rdx)
; AVX512-FCP-NEXT: vmovdqa64 %xmm16, 128(%rdx)
; AVX512-FCP-NEXT: retq
;
; AVX512DQ-LABEL: store_i16_stride2_vf64:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovdqa 64(%rsi), %xmm1
; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm2
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; AVX512DQ-NEXT: vmovdqa64 %xmm0, %xmm16
; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
; AVX512DQ-NEXT: vmovdqa 80(%rsi), %xmm3
; AVX512DQ-NEXT: vmovdqa 80(%rdi), %xmm4
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
; AVX512DQ-NEXT: vmovdqa 96(%rsi), %xmm5
; AVX512DQ-NEXT: vmovdqa 96(%rdi), %xmm6
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
; AVX512DQ-NEXT: vmovdqa 112(%rsi), %xmm6
; AVX512DQ-NEXT: vmovdqa 112(%rdi), %xmm7
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm7
; AVX512DQ-NEXT: vmovdqa 16(%rsi), %xmm9
; AVX512DQ-NEXT: vmovdqa 32(%rsi), %xmm10
; AVX512DQ-NEXT: vmovdqa 48(%rsi), %xmm11
; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm12
; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm13
; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm14
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3]
; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm13[4],xmm10[4],xmm13[5],xmm10[5],xmm13[6],xmm10[6],xmm13[7],xmm10[7]
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3]
; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm14[4],xmm11[4],xmm14[5],xmm11[5],xmm14[6],xmm11[6],xmm14[7],xmm11[7]
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3]
; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm12[4],xmm7[4],xmm12[5],xmm7[5],xmm12[6],xmm7[6],xmm12[7],xmm7[7]
; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm12
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3]
; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm12[4],xmm9[4],xmm12[5],xmm9[5],xmm12[6],xmm9[6],xmm12[7],xmm9[7]
; AVX512DQ-NEXT: vmovdqa %xmm9, 48(%rdx)
; AVX512DQ-NEXT: vmovdqa %xmm0, 32(%rdx)
; AVX512DQ-NEXT: vmovdqa %xmm7, 16(%rdx)
; AVX512DQ-NEXT: vmovdqa %xmm14, (%rdx)
; AVX512DQ-NEXT: vmovdqa %xmm11, 112(%rdx)
; AVX512DQ-NEXT: vmovdqa %xmm13, 96(%rdx)
; AVX512DQ-NEXT: vmovdqa %xmm10, 80(%rdx)
; AVX512DQ-NEXT: vmovdqa %xmm15, 64(%rdx)
; AVX512DQ-NEXT: vmovdqa %xmm6, 240(%rdx)
; AVX512DQ-NEXT: vmovdqa %xmm8, 224(%rdx)
; AVX512DQ-NEXT: vmovdqa %xmm5, 208(%rdx)
; AVX512DQ-NEXT: vmovdqa %xmm4, 192(%rdx)
; AVX512DQ-NEXT: vmovdqa %xmm3, 176(%rdx)
; AVX512DQ-NEXT: vmovdqa %xmm2, 160(%rdx)
; AVX512DQ-NEXT: vmovdqa %xmm1, 144(%rdx)
; AVX512DQ-NEXT: vmovdqa64 %xmm16, 128(%rdx)
; AVX512DQ-NEXT: retq
;
; AVX512DQ-FCP-LABEL: store_i16_stride2_vf64:
; AVX512DQ-FCP: # %bb.0:
; AVX512DQ-FCP-NEXT: vmovdqa 64(%rsi), %xmm1
; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm2
; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm0, %xmm16
; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
; AVX512DQ-FCP-NEXT: vmovdqa 80(%rsi), %xmm3
; AVX512DQ-FCP-NEXT: vmovdqa 80(%rdi), %xmm4
; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
; AVX512DQ-FCP-NEXT: vmovdqa 96(%rsi), %xmm5
; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %xmm6
; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
; AVX512DQ-FCP-NEXT: vmovdqa 112(%rsi), %xmm6
; AVX512DQ-FCP-NEXT: vmovdqa 112(%rdi), %xmm7
; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm7
; AVX512DQ-FCP-NEXT: vmovdqa 16(%rsi), %xmm9
; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %xmm10
; AVX512DQ-FCP-NEXT: vmovdqa 48(%rsi), %xmm11
; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm12
; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm13
; AVX512DQ-FCP-NEXT: vmovdqa 48(%rdi), %xmm14
; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3]
; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm13[4],xmm10[4],xmm13[5],xmm10[5],xmm13[6],xmm10[6],xmm13[7],xmm10[7]
; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3]
; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm14[4],xmm11[4],xmm14[5],xmm11[5],xmm14[6],xmm11[6],xmm14[7],xmm11[7]
; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm12[0],xmm7[0],xmm12[1],xmm7[1],xmm12[2],xmm7[2],xmm12[3],xmm7[3]
; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm12[4],xmm7[4],xmm12[5],xmm7[5],xmm12[6],xmm7[6],xmm12[7],xmm7[7]
; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm12
; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3]
; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm12[4],xmm9[4],xmm12[5],xmm9[5],xmm12[6],xmm9[6],xmm12[7],xmm9[7]
; AVX512DQ-FCP-NEXT: vmovdqa %xmm9, 48(%rdx)
; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, 32(%rdx)
; AVX512DQ-FCP-NEXT: vmovdqa %xmm7, 16(%rdx)
; AVX512DQ-FCP-NEXT: vmovdqa %xmm14, (%rdx)
; AVX512DQ-FCP-NEXT: vmovdqa %xmm11, 112(%rdx)
; AVX512DQ-FCP-NEXT: vmovdqa %xmm13, 96(%rdx)
; AVX512DQ-FCP-NEXT: vmovdqa %xmm10, 80(%rdx)
; AVX512DQ-FCP-NEXT: vmovdqa %xmm15, 64(%rdx)
; AVX512DQ-FCP-NEXT: vmovdqa %xmm6, 240(%rdx)
; AVX512DQ-FCP-NEXT: vmovdqa %xmm8, 224(%rdx)
; AVX512DQ-FCP-NEXT: vmovdqa %xmm5, 208(%rdx)
; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, 192(%rdx)
; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, 176(%rdx)
; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, 160(%rdx)
; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, 144(%rdx)
; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm16, 128(%rdx)
; AVX512DQ-FCP-NEXT: retq
;
; AVX512BW-LABEL: store_i16_stride2_vf64:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1
; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm2
; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm3
; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [16,48,17,49,18,50,19,51,20,52,21,53,22,54,23,55,24,56,25,57,26,58,27,59,28,60,29,61,30,62,31,63]
; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5
; AVX512BW-NEXT: vpermt2w %zmm2, %zmm4, %zmm5
; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,32,1,33,2,34,3,35,4,36,5,37,6,38,7,39,8,40,9,41,10,42,11,43,12,44,13,45,14,46,15,47]
; AVX512BW-NEXT: vpermt2w %zmm2, %zmm6, %zmm0
; AVX512BW-NEXT: vpermi2w %zmm3, %zmm1, %zmm4
; AVX512BW-NEXT: vpermt2w %zmm3, %zmm6, %zmm1
; AVX512BW-NEXT: vmovdqa64 %zmm1, 128(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm4, 192(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm5, 64(%rdx)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BW-FCP-LABEL: store_i16_stride2_vf64:
; AVX512BW-FCP: # %bb.0:
; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1
; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm2
; AVX512BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm3
; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [16,48,17,49,18,50,19,51,20,52,21,53,22,54,23,55,24,56,25,57,26,58,27,59,28,60,29,61,30,62,31,63]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5
; AVX512BW-FCP-NEXT: vpermt2w %zmm2, %zmm4, %zmm5
; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,32,1,33,2,34,3,35,4,36,5,37,6,38,7,39,8,40,9,41,10,42,11,43,12,44,13,45,14,46,15,47]
; AVX512BW-FCP-NEXT: vpermt2w %zmm2, %zmm6, %zmm0
; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm1, %zmm4
; AVX512BW-FCP-NEXT: vpermt2w %zmm3, %zmm6, %zmm1
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, 128(%rdx)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, 192(%rdx)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, 64(%rdx)
; AVX512BW-FCP-NEXT: vzeroupper
; AVX512BW-FCP-NEXT: retq
;
; AVX512DQ-BW-LABEL: store_i16_stride2_vf64:
; AVX512DQ-BW: # %bb.0:
; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1
; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %zmm2
; AVX512DQ-BW-NEXT: vmovdqa64 64(%rsi), %zmm3
; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm4 = [16,48,17,49,18,50,19,51,20,52,21,53,22,54,23,55,24,56,25,57,26,58,27,59,28,60,29,61,30,62,31,63]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm5
; AVX512DQ-BW-NEXT: vpermt2w %zmm2, %zmm4, %zmm5
; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,32,1,33,2,34,3,35,4,36,5,37,6,38,7,39,8,40,9,41,10,42,11,43,12,44,13,45,14,46,15,47]
; AVX512DQ-BW-NEXT: vpermt2w %zmm2, %zmm6, %zmm0
; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm1, %zmm4
; AVX512DQ-BW-NEXT: vpermt2w %zmm3, %zmm6, %zmm1
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, 128(%rdx)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, 192(%rdx)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, 64(%rdx)
; AVX512DQ-BW-NEXT: vzeroupper
; AVX512DQ-BW-NEXT: retq
;
; AVX512DQ-BW-FCP-LABEL: store_i16_stride2_vf64:
; AVX512DQ-BW-FCP: # %bb.0:
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %zmm2
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rsi), %zmm3
; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm4 = [16,48,17,49,18,50,19,51,20,52,21,53,22,54,23,55,24,56,25,57,26,58,27,59,28,60,29,61,30,62,31,63]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm5
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm2, %zmm4, %zmm5
; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,32,1,33,2,34,3,35,4,36,5,37,6,38,7,39,8,40,9,41,10,42,11,43,12,44,13,45,14,46,15,47]
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm2, %zmm6, %zmm0
; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm1, %zmm4
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm3, %zmm6, %zmm1
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, 128(%rdx)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, 192(%rdx)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, (%rdx)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, 64(%rdx)
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
  %in.vec0 = load <64 x i16>, ptr %in.vecptr0, align 64
  %in.vec1 = load <64 x i16>, ptr %in.vecptr1, align 64
  %1 = shufflevector <64 x i16> %in.vec0, <64 x i16> %in.vec1, <128 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72, i32 73, i32 74, i32 75, i32 76, i32 77, i32 78, i32 79, i32 80, i32 81, i32 82, i32 83, i32 84, i32 85, i32 86, i32 87, i32 88, i32 89, i32 90, i32 91, i32 92, i32 93, i32 94, i32 95, i32 96, i32 97, i32 98, i32 99, i32 100, i32 101, i32 102, i32 103, i32 104, i32 105, i32 106, i32 107, i32 108, i32 109, i32 110, i32 111, i32 112, i32 113, i32 114, i32 115, i32 116, i32 117, i32 118, i32 119, i32 120, i32 121, i32 122, i32 123, i32 124, i32 125, i32 126, i32 127>
  %interleaved.vec = shufflevector <128 x i16> %1, <128 x i16> poison, <128 x i32> <i32 0, i32 64, i32 1, i32 65, i32 2, i32 66, i32 3, i32 67, i32 4, i32 68, i32 5, i32 69, i32 6, i32 70, i32 7, i32 71, i32 8, i32 72, i32 9, i32 73, i32 10, i32 74, i32 11, i32 75, i32 12, i32 76, i32 13, i32 77, i32 14, i32 78, i32 15, i32 79, i32 16, i32 80, i32 17, i32 81, i32 18, i32 82, i32 19, i32 83, i32 20, i32 84, i32 21, i32 85, i32 22, i32 86, i32 23, i32 87, i32 24, i32 88, i32 25, i32 89, i32 26, i32 90, i32 27, i32 91, i32 28, i32 92, i32 29, i32 93, i32 30, i32 94, i32 31, i32 95, i32 32, i32 96, i32 33, i32 97, i32 34, i32 98, i32 35, i32 99, i32 36, i32 100, i32 37, i32 101, i32 38, i32 102, i32 39, i32 103, i32 40, i32 104, i32 41, i32 105, i32 42, i32 106, i32 43, i32 107, i32 44, i32 108, i32 45, i32 109, i32 46, i32 110, i32 47, i32 111, i32 48, i32 112, i32 49, i32 113, i32 50, i32 114, i32 51, i32 115, i32 52, i32 116, i32 53, i32 117, i32 54, i32 118, i32 55, i32 119, i32 56, i32 120, i32 57, i32 121, i32 58, i32 122, i32 59, i32 123, i32 60, i32 124, i32 61, i32 125, i32 62, i32 126, i32 63, i32 127>
  store <128 x i16> %interleaved.vec, ptr %out.vec, align 64
  ret void
}