1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE 3; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefixes=AVX 4; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2 5; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FP 6; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FCP 7; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512 8; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512-FCP 9; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX512DQ 10; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-FCP 11; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512BW 12; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512BW-FCP 13; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX512DQ-BW 14; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-BW-FCP 15 16; These patterns are produced by LoopVectorizer for interleaved loads. 17 18define void @load_i32_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2) nounwind { 19; SSE-LABEL: load_i32_stride3_vf2: 20; SSE: # %bb.0: 21; SSE-NEXT: movdqa (%rdi), %xmm0 22; SSE-NEXT: movdqa 16(%rdi), %xmm1 23; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,3,2,3] 24; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,1,1] 25; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] 26; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] 27; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] 28; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 29; SSE-NEXT: movq %xmm2, (%rsi) 30; SSE-NEXT: movq %xmm3, (%rdx) 31; SSE-NEXT: movq %xmm0, (%rcx) 32; SSE-NEXT: retq 33; 34; AVX-LABEL: load_i32_stride3_vf2: 35; AVX: # %bb.0: 36; AVX-NEXT: vmovaps (%rdi), %xmm0 37; AVX-NEXT: vmovaps 16(%rdi), %xmm1 38; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,3,2,3] 39; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm1[0],xmm0[1],xmm1[2,3] 40; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm3[1,0,2,3] 41; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3] 42; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] 43; AVX-NEXT: vmovlps %xmm2, (%rsi) 44; AVX-NEXT: vmovlps %xmm3, (%rdx) 45; AVX-NEXT: vmovlps %xmm0, (%rcx) 46; AVX-NEXT: retq 47; 48; AVX2-LABEL: load_i32_stride3_vf2: 49; AVX2: # %bb.0: 50; AVX2-NEXT: vmovaps (%rdi), %xmm0 51; AVX2-NEXT: vmovaps 16(%rdi), %xmm1 52; AVX2-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,3,2,3] 53; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] 54; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3] 55; AVX2-NEXT: vbroadcastss 8(%rdi), %xmm3 56; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3] 57; AVX2-NEXT: vmovlps %xmm2, (%rsi) 58; AVX2-NEXT: vmovlps %xmm0, (%rdx) 59; AVX2-NEXT: vmovlps %xmm1, (%rcx) 60; AVX2-NEXT: retq 61; 62; AVX2-FP-LABEL: load_i32_stride3_vf2: 63; AVX2-FP: # %bb.0: 64; AVX2-FP-NEXT: vmovaps (%rdi), %xmm0 65; AVX2-FP-NEXT: vmovaps 16(%rdi), %xmm1 66; AVX2-FP-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,3,2,3] 67; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] 68; AVX2-FP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3] 69; AVX2-FP-NEXT: vbroadcastss 8(%rdi), %xmm3 70; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3] 71; AVX2-FP-NEXT: vmovlps %xmm2, (%rsi) 72; AVX2-FP-NEXT: vmovlps %xmm0, (%rdx) 73; AVX2-FP-NEXT: vmovlps %xmm1, (%rcx) 74; AVX2-FP-NEXT: retq 75; 76; AVX2-FCP-LABEL: load_i32_stride3_vf2: 77; AVX2-FCP: # %bb.0: 78; AVX2-FCP-NEXT: vmovaps (%rdi), %xmm0 79; AVX2-FCP-NEXT: vmovaps 16(%rdi), %xmm1 80; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,3,2,3] 81; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] 82; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3] 83; AVX2-FCP-NEXT: vbroadcastss 8(%rdi), %xmm3 84; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3] 85; AVX2-FCP-NEXT: vmovlps %xmm2, (%rsi) 86; AVX2-FCP-NEXT: vmovlps %xmm0, (%rdx) 87; AVX2-FCP-NEXT: vmovlps %xmm1, (%rcx) 88; AVX2-FCP-NEXT: retq 89; 90; AVX512-LABEL: load_i32_stride3_vf2: 91; AVX512: # %bb.0: 92; AVX512-NEXT: vmovaps (%rdi), %xmm0 93; AVX512-NEXT: vmovaps 16(%rdi), %xmm1 94; AVX512-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,3,2,3] 95; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] 96; AVX512-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3] 97; AVX512-NEXT: vbroadcastss 8(%rdi), %xmm3 98; AVX512-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3] 99; AVX512-NEXT: vmovlps %xmm2, (%rsi) 100; AVX512-NEXT: vmovlps %xmm0, (%rdx) 101; AVX512-NEXT: vmovlps %xmm1, (%rcx) 102; AVX512-NEXT: retq 103; 104; AVX512-FCP-LABEL: load_i32_stride3_vf2: 105; AVX512-FCP: # %bb.0: 106; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0 107; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 108; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,3,2,3] 109; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [5,0] 110; AVX512-FCP-NEXT: vpermi2d %xmm0, %xmm1, %xmm3 111; AVX512-FCP-NEXT: vpbroadcastd 8(%rdi), %xmm0 112; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] 113; AVX512-FCP-NEXT: vmovq %xmm2, (%rsi) 114; AVX512-FCP-NEXT: vmovq %xmm3, (%rdx) 115; AVX512-FCP-NEXT: vmovq %xmm0, (%rcx) 116; AVX512-FCP-NEXT: retq 117; 118; AVX512DQ-LABEL: load_i32_stride3_vf2: 119; AVX512DQ: # %bb.0: 120; AVX512DQ-NEXT: vmovaps (%rdi), %xmm0 121; AVX512DQ-NEXT: vmovaps 16(%rdi), %xmm1 122; AVX512DQ-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,3,2,3] 123; AVX512DQ-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] 124; AVX512DQ-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3] 125; AVX512DQ-NEXT: vbroadcastss 8(%rdi), %xmm3 126; AVX512DQ-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3] 127; AVX512DQ-NEXT: vmovlps %xmm2, (%rsi) 128; AVX512DQ-NEXT: vmovlps %xmm0, (%rdx) 129; AVX512DQ-NEXT: vmovlps %xmm1, (%rcx) 130; AVX512DQ-NEXT: retq 131; 132; AVX512DQ-FCP-LABEL: load_i32_stride3_vf2: 133; AVX512DQ-FCP: # %bb.0: 134; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0 135; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 136; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,3,2,3] 137; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [5,0] 138; AVX512DQ-FCP-NEXT: vpermi2d %xmm0, %xmm1, %xmm3 139; AVX512DQ-FCP-NEXT: vpbroadcastd 8(%rdi), %xmm0 140; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] 141; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rsi) 142; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rdx) 143; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rcx) 144; AVX512DQ-FCP-NEXT: retq 145; 146; AVX512BW-LABEL: load_i32_stride3_vf2: 147; AVX512BW: # %bb.0: 148; AVX512BW-NEXT: vmovaps (%rdi), %xmm0 149; AVX512BW-NEXT: vmovaps 16(%rdi), %xmm1 150; AVX512BW-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,3,2,3] 151; AVX512BW-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] 152; AVX512BW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3] 153; AVX512BW-NEXT: vbroadcastss 8(%rdi), %xmm3 154; AVX512BW-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3] 155; AVX512BW-NEXT: vmovlps %xmm2, (%rsi) 156; AVX512BW-NEXT: vmovlps %xmm0, (%rdx) 157; AVX512BW-NEXT: vmovlps %xmm1, (%rcx) 158; AVX512BW-NEXT: retq 159; 160; AVX512BW-FCP-LABEL: load_i32_stride3_vf2: 161; AVX512BW-FCP: # %bb.0: 162; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 163; AVX512BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 164; AVX512BW-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,3,2,3] 165; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [5,0] 166; AVX512BW-FCP-NEXT: vpermi2d %xmm0, %xmm1, %xmm3 167; AVX512BW-FCP-NEXT: vpbroadcastd 8(%rdi), %xmm0 168; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] 169; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rsi) 170; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rdx) 171; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rcx) 172; AVX512BW-FCP-NEXT: retq 173; 174; AVX512DQ-BW-LABEL: load_i32_stride3_vf2: 175; AVX512DQ-BW: # %bb.0: 176; AVX512DQ-BW-NEXT: vmovaps (%rdi), %xmm0 177; AVX512DQ-BW-NEXT: vmovaps 16(%rdi), %xmm1 178; AVX512DQ-BW-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,3,2,3] 179; AVX512DQ-BW-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3] 180; AVX512DQ-BW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,2,3] 181; AVX512DQ-BW-NEXT: vbroadcastss 8(%rdi), %xmm3 182; AVX512DQ-BW-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3] 183; AVX512DQ-BW-NEXT: vmovlps %xmm2, (%rsi) 184; AVX512DQ-BW-NEXT: vmovlps %xmm0, (%rdx) 185; AVX512DQ-BW-NEXT: vmovlps %xmm1, (%rcx) 186; AVX512DQ-BW-NEXT: retq 187; 188; AVX512DQ-BW-FCP-LABEL: load_i32_stride3_vf2: 189; AVX512DQ-BW-FCP: # %bb.0: 190; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0 191; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm1 192; AVX512DQ-BW-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,3,2,3] 193; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [5,0] 194; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm0, %xmm1, %xmm3 195; AVX512DQ-BW-FCP-NEXT: vpbroadcastd 8(%rdi), %xmm0 196; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] 197; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rsi) 198; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rdx) 199; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rcx) 200; AVX512DQ-BW-FCP-NEXT: retq 201 %wide.vec = load <6 x i32>, ptr %in.vec, align 64 202 %strided.vec0 = shufflevector <6 x i32> %wide.vec, <6 x i32> poison, <2 x i32> <i32 0, i32 3> 203 %strided.vec1 = shufflevector <6 x i32> %wide.vec, <6 x i32> poison, <2 x i32> <i32 1, i32 4> 204 %strided.vec2 = shufflevector <6 x i32> %wide.vec, <6 x i32> poison, <2 x i32> <i32 2, i32 5> 205 store <2 x i32> %strided.vec0, ptr %out.vec0, align 64 206 store <2 x i32> %strided.vec1, ptr %out.vec1, align 64 207 store <2 x i32> %strided.vec2, ptr %out.vec2, align 64 208 ret void 209} 210 211define void @load_i32_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2) nounwind { 212; SSE-LABEL: load_i32_stride3_vf4: 213; SSE: # %bb.0: 214; SSE-NEXT: movdqa (%rdi), %xmm0 215; SSE-NEXT: movaps 16(%rdi), %xmm1 216; SSE-NEXT: movaps 32(%rdi), %xmm2 217; SSE-NEXT: movdqa %xmm0, %xmm3 218; SSE-NEXT: movaps %xmm1, %xmm4 219; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm0[2,3,2,3] 220; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] 221; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[1,1,1,1] 222; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[1,0] 223; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm1[0,2] 224; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm2[2,3] 225; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm4[0,2] 226; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] 227; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm2[0,3] 228; SSE-NEXT: movaps %xmm3, (%rsi) 229; SSE-NEXT: movaps %xmm0, (%rdx) 230; SSE-NEXT: movaps %xmm5, (%rcx) 231; SSE-NEXT: retq 232; 233; AVX-LABEL: load_i32_stride3_vf4: 234; AVX: # %bb.0: 235; AVX-NEXT: vmovaps (%rdi), %xmm0 236; AVX-NEXT: vmovaps 16(%rdi), %xmm1 237; AVX-NEXT: vmovaps 32(%rdi), %xmm2 238; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm0[0],xmm2[1],xmm0[2,3] 239; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm1[2],xmm3[3] 240; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm3[0,3,2,1] 241; AVX-NEXT: vmovaps 32(%rdi), %xmm4 242; AVX-NEXT: vblendps {{.*#+}} xmm4 = mem[0,1],xmm4[2,3] 243; AVX-NEXT: vblendps {{.*#+}} xmm4 = xmm1[0],xmm4[1,2],xmm1[3] 244; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm4[1,0,3,2] 245; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3] 246; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] 247; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,1,0,3] 248; AVX-NEXT: vmovaps %xmm3, (%rsi) 249; AVX-NEXT: vmovaps %xmm4, (%rdx) 250; AVX-NEXT: vmovaps %xmm0, (%rcx) 251; AVX-NEXT: retq 252; 253; AVX2-LABEL: load_i32_stride3_vf4: 254; AVX2: # %bb.0: 255; AVX2-NEXT: vmovaps (%rdi), %ymm0 256; AVX2-NEXT: vmovaps 32(%rdi), %ymm1 257; AVX2-NEXT: vmovaps {{.*#+}} xmm2 = [0,3,6,1] 258; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7] 259; AVX2-NEXT: vpermps %ymm3, %ymm2, %ymm2 260; AVX2-NEXT: vmovaps {{.*#+}} xmm3 = [1,4,7,2] 261; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] 262; AVX2-NEXT: vpermps %ymm4, %ymm3, %ymm3 263; AVX2-NEXT: vmovaps {{.*#+}} xmm4 = [2,5,0,3] 264; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4,5,6,7] 265; AVX2-NEXT: vpermps %ymm0, %ymm4, %ymm0 266; AVX2-NEXT: vmovaps %xmm2, (%rsi) 267; AVX2-NEXT: vmovaps %xmm3, (%rdx) 268; AVX2-NEXT: vmovaps %xmm0, (%rcx) 269; AVX2-NEXT: vzeroupper 270; AVX2-NEXT: retq 271; 272; AVX2-FP-LABEL: load_i32_stride3_vf4: 273; AVX2-FP: # %bb.0: 274; AVX2-FP-NEXT: vmovaps (%rdi), %ymm0 275; AVX2-FP-NEXT: vmovaps 32(%rdi), %ymm1 276; AVX2-FP-NEXT: vmovaps {{.*#+}} xmm2 = [0,3,6,1] 277; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7] 278; AVX2-FP-NEXT: vpermps %ymm3, %ymm2, %ymm2 279; AVX2-FP-NEXT: vmovaps {{.*#+}} xmm3 = [1,4,7,2] 280; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] 281; AVX2-FP-NEXT: vpermps %ymm4, %ymm3, %ymm3 282; AVX2-FP-NEXT: vmovaps {{.*#+}} xmm4 = [2,5,0,3] 283; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4,5,6,7] 284; AVX2-FP-NEXT: vpermps %ymm0, %ymm4, %ymm0 285; AVX2-FP-NEXT: vmovaps %xmm2, (%rsi) 286; AVX2-FP-NEXT: vmovaps %xmm3, (%rdx) 287; AVX2-FP-NEXT: vmovaps %xmm0, (%rcx) 288; AVX2-FP-NEXT: vzeroupper 289; AVX2-FP-NEXT: retq 290; 291; AVX2-FCP-LABEL: load_i32_stride3_vf4: 292; AVX2-FCP: # %bb.0: 293; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm0 294; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm1 295; AVX2-FCP-NEXT: vmovaps {{.*#+}} xmm2 = [0,3,6,1] 296; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0],ymm1[1],ymm0[2,3,4,5,6,7] 297; AVX2-FCP-NEXT: vpermps %ymm3, %ymm2, %ymm2 298; AVX2-FCP-NEXT: vmovaps {{.*#+}} xmm3 = [1,4,7,2] 299; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] 300; AVX2-FCP-NEXT: vpermps %ymm4, %ymm3, %ymm3 301; AVX2-FCP-NEXT: vmovaps {{.*#+}} xmm4 = [2,5,0,3] 302; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4,5,6,7] 303; AVX2-FCP-NEXT: vpermps %ymm0, %ymm4, %ymm0 304; AVX2-FCP-NEXT: vmovaps %xmm2, (%rsi) 305; AVX2-FCP-NEXT: vmovaps %xmm3, (%rdx) 306; AVX2-FCP-NEXT: vmovaps %xmm0, (%rcx) 307; AVX2-FCP-NEXT: vzeroupper 308; AVX2-FCP-NEXT: retq 309; 310; AVX512-LABEL: load_i32_stride3_vf4: 311; AVX512: # %bb.0: 312; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,3,6,9] 313; AVX512-NEXT: vmovaps (%rdi), %zmm1 314; AVX512-NEXT: vpermps %zmm1, %zmm0, %zmm0 315; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,4,7,10] 316; AVX512-NEXT: vpermps %zmm1, %zmm2, %zmm2 317; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm3 = [2,5,8,11] 318; AVX512-NEXT: vpermps %zmm1, %zmm3, %zmm1 319; AVX512-NEXT: vmovaps %xmm0, (%rsi) 320; AVX512-NEXT: vmovaps %xmm2, (%rdx) 321; AVX512-NEXT: vmovaps %xmm1, (%rcx) 322; AVX512-NEXT: vzeroupper 323; AVX512-NEXT: retq 324; 325; AVX512-FCP-LABEL: load_i32_stride3_vf4: 326; AVX512-FCP: # %bb.0: 327; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,3,6,9] 328; AVX512-FCP-NEXT: vmovaps (%rdi), %zmm1 329; AVX512-FCP-NEXT: vpermps %zmm1, %zmm0, %zmm0 330; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,4,7,10] 331; AVX512-FCP-NEXT: vpermps %zmm1, %zmm2, %zmm2 332; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [2,5,8,11] 333; AVX512-FCP-NEXT: vpermps %zmm1, %zmm3, %zmm1 334; AVX512-FCP-NEXT: vmovaps %xmm0, (%rsi) 335; AVX512-FCP-NEXT: vmovaps %xmm2, (%rdx) 336; AVX512-FCP-NEXT: vmovaps %xmm1, (%rcx) 337; AVX512-FCP-NEXT: vzeroupper 338; AVX512-FCP-NEXT: retq 339; 340; AVX512DQ-LABEL: load_i32_stride3_vf4: 341; AVX512DQ: # %bb.0: 342; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,3,6,9] 343; AVX512DQ-NEXT: vmovaps (%rdi), %zmm1 344; AVX512DQ-NEXT: vpermps %zmm1, %zmm0, %zmm0 345; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,4,7,10] 346; AVX512DQ-NEXT: vpermps %zmm1, %zmm2, %zmm2 347; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm3 = [2,5,8,11] 348; AVX512DQ-NEXT: vpermps %zmm1, %zmm3, %zmm1 349; AVX512DQ-NEXT: vmovaps %xmm0, (%rsi) 350; AVX512DQ-NEXT: vmovaps %xmm2, (%rdx) 351; AVX512DQ-NEXT: vmovaps %xmm1, (%rcx) 352; AVX512DQ-NEXT: vzeroupper 353; AVX512DQ-NEXT: retq 354; 355; AVX512DQ-FCP-LABEL: load_i32_stride3_vf4: 356; AVX512DQ-FCP: # %bb.0: 357; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,3,6,9] 358; AVX512DQ-FCP-NEXT: vmovaps (%rdi), %zmm1 359; AVX512DQ-FCP-NEXT: vpermps %zmm1, %zmm0, %zmm0 360; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,4,7,10] 361; AVX512DQ-FCP-NEXT: vpermps %zmm1, %zmm2, %zmm2 362; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [2,5,8,11] 363; AVX512DQ-FCP-NEXT: vpermps %zmm1, %zmm3, %zmm1 364; AVX512DQ-FCP-NEXT: vmovaps %xmm0, (%rsi) 365; AVX512DQ-FCP-NEXT: vmovaps %xmm2, (%rdx) 366; AVX512DQ-FCP-NEXT: vmovaps %xmm1, (%rcx) 367; AVX512DQ-FCP-NEXT: vzeroupper 368; AVX512DQ-FCP-NEXT: retq 369; 370; AVX512BW-LABEL: load_i32_stride3_vf4: 371; AVX512BW: # %bb.0: 372; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,3,6,9] 373; AVX512BW-NEXT: vmovaps (%rdi), %zmm1 374; AVX512BW-NEXT: vpermps %zmm1, %zmm0, %zmm0 375; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,4,7,10] 376; AVX512BW-NEXT: vpermps %zmm1, %zmm2, %zmm2 377; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm3 = [2,5,8,11] 378; AVX512BW-NEXT: vpermps %zmm1, %zmm3, %zmm1 379; AVX512BW-NEXT: vmovaps %xmm0, (%rsi) 380; AVX512BW-NEXT: vmovaps %xmm2, (%rdx) 381; AVX512BW-NEXT: vmovaps %xmm1, (%rcx) 382; AVX512BW-NEXT: vzeroupper 383; AVX512BW-NEXT: retq 384; 385; AVX512BW-FCP-LABEL: load_i32_stride3_vf4: 386; AVX512BW-FCP: # %bb.0: 387; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,3,6,9] 388; AVX512BW-FCP-NEXT: vmovaps (%rdi), %zmm1 389; AVX512BW-FCP-NEXT: vpermps %zmm1, %zmm0, %zmm0 390; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,4,7,10] 391; AVX512BW-FCP-NEXT: vpermps %zmm1, %zmm2, %zmm2 392; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [2,5,8,11] 393; AVX512BW-FCP-NEXT: vpermps %zmm1, %zmm3, %zmm1 394; AVX512BW-FCP-NEXT: vmovaps %xmm0, (%rsi) 395; AVX512BW-FCP-NEXT: vmovaps %xmm2, (%rdx) 396; AVX512BW-FCP-NEXT: vmovaps %xmm1, (%rcx) 397; AVX512BW-FCP-NEXT: vzeroupper 398; AVX512BW-FCP-NEXT: retq 399; 400; AVX512DQ-BW-LABEL: load_i32_stride3_vf4: 401; AVX512DQ-BW: # %bb.0: 402; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,3,6,9] 403; AVX512DQ-BW-NEXT: vmovaps (%rdi), %zmm1 404; AVX512DQ-BW-NEXT: vpermps %zmm1, %zmm0, %zmm0 405; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,4,7,10] 406; AVX512DQ-BW-NEXT: vpermps %zmm1, %zmm2, %zmm2 407; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm3 = [2,5,8,11] 408; AVX512DQ-BW-NEXT: vpermps %zmm1, %zmm3, %zmm1 409; AVX512DQ-BW-NEXT: vmovaps %xmm0, (%rsi) 410; AVX512DQ-BW-NEXT: vmovaps %xmm2, (%rdx) 411; AVX512DQ-BW-NEXT: vmovaps %xmm1, (%rcx) 412; AVX512DQ-BW-NEXT: vzeroupper 413; AVX512DQ-BW-NEXT: retq 414; 415; AVX512DQ-BW-FCP-LABEL: load_i32_stride3_vf4: 416; AVX512DQ-BW-FCP: # %bb.0: 417; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,3,6,9] 418; AVX512DQ-BW-FCP-NEXT: vmovaps (%rdi), %zmm1 419; AVX512DQ-BW-FCP-NEXT: vpermps %zmm1, %zmm0, %zmm0 420; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [1,4,7,10] 421; AVX512DQ-BW-FCP-NEXT: vpermps %zmm1, %zmm2, %zmm2 422; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [2,5,8,11] 423; AVX512DQ-BW-FCP-NEXT: vpermps %zmm1, %zmm3, %zmm1 424; AVX512DQ-BW-FCP-NEXT: vmovaps %xmm0, (%rsi) 425; AVX512DQ-BW-FCP-NEXT: vmovaps %xmm2, (%rdx) 426; AVX512DQ-BW-FCP-NEXT: vmovaps %xmm1, (%rcx) 427; AVX512DQ-BW-FCP-NEXT: vzeroupper 428; AVX512DQ-BW-FCP-NEXT: retq 429 %wide.vec = load <12 x i32>, ptr %in.vec, align 64 430 %strided.vec0 = shufflevector <12 x i32> %wide.vec, <12 x i32> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9> 431 %strided.vec1 = shufflevector <12 x i32> %wide.vec, <12 x i32> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10> 432 %strided.vec2 = shufflevector <12 x i32> %wide.vec, <12 x i32> poison, <4 x i32> <i32 2, i32 5, i32 8, i32 11> 433 store <4 x i32> %strided.vec0, ptr %out.vec0, align 64 434 store <4 x i32> %strided.vec1, ptr %out.vec1, align 64 435 store <4 x i32> %strided.vec2, ptr %out.vec2, align 64 436 ret void 437} 438 439define void @load_i32_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2) nounwind { 440; SSE-LABEL: load_i32_stride3_vf8: 441; SSE: # %bb.0: 442; SSE-NEXT: movaps 80(%rdi), %xmm1 443; SSE-NEXT: movaps 64(%rdi), %xmm5 444; SSE-NEXT: movdqa (%rdi), %xmm0 445; SSE-NEXT: movaps 16(%rdi), %xmm7 446; SSE-NEXT: movaps 32(%rdi), %xmm4 447; SSE-NEXT: movdqa 48(%rdi), %xmm2 448; SSE-NEXT: movdqa %xmm0, %xmm3 449; SSE-NEXT: movaps %xmm7, %xmm8 450; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[2,3,2,3] 451; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm7[0,0] 452; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm7[1,1,1,1] 453; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm4[1,0] 454; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm7[0,2] 455; SSE-NEXT: movdqa %xmm2, %xmm7 456; SSE-NEXT: movaps %xmm5, %xmm10 457; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm2[2,3,2,3] 458; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm5[0,0] 459; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm5[1,1,1,1] 460; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm1[1,0] 461; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,3],xmm5[0,2] 462; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[3,1],xmm1[2,3] 463; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm10[0,2] 464; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[3,1],xmm4[2,3] 465; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm8[0,2] 466; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1] 467; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm4[0,3] 468; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1] 469; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm1[0,3] 470; SSE-NEXT: movaps %xmm7, 16(%rsi) 471; SSE-NEXT: movaps %xmm3, (%rsi) 472; SSE-NEXT: movaps %xmm2, 16(%rdx) 473; SSE-NEXT: movaps %xmm0, (%rdx) 474; SSE-NEXT: movaps %xmm11, 16(%rcx) 475; SSE-NEXT: movaps %xmm6, (%rcx) 476; SSE-NEXT: retq 477; 478; AVX-LABEL: load_i32_stride3_vf8: 479; AVX: # %bb.0: 480; AVX-NEXT: vmovaps 64(%rdi), %ymm0 481; AVX-NEXT: vmovaps 32(%rdi), %ymm1 482; AVX-NEXT: vmovaps (%rdi), %ymm2 483; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] 484; AVX-NEXT: vmovaps 16(%rdi), %xmm4 485; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,1],ymm1[1,3],ymm4[6,5],ymm1[5,7] 486; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,3],ymm4[0,2],ymm3[4,7],ymm4[4,6] 487; AVX-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[2,3,0,1] 488; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm0[1,0],ymm4[2,0],ymm0[5,4],ymm4[6,4] 489; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,1,2,0,4,5,6,4] 490; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7] 491; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm0[2,0],ymm4[3,0],ymm0[6,4],ymm4[7,4] 492; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm4[0,0],ymm5[2,0],ymm4[4,4],ymm5[6,4] 493; AVX-NEXT: vmovaps 16(%rdi), %xmm6 494; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] 495; AVX-NEXT: vshufps {{.*#+}} ymm7 = ymm7[1,2],ymm6[0,3],ymm7[5,6],ymm6[4,7] 496; AVX-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,2,3,1,4,6,7,5] 497; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1,2,3,4],ymm5[5,6,7] 498; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] 499; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm6[1,0],ymm2[2,0],ymm6[5,4],ymm2[6,4] 500; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,0],ymm1[0,3],ymm2[6,4],ymm1[4,7] 501; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm4[0,1],ymm0[0,3],ymm4[4,5],ymm0[4,7] 502; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] 503; AVX-NEXT: vmovaps %ymm3, (%rsi) 504; AVX-NEXT: vmovaps %ymm5, (%rdx) 505; AVX-NEXT: vmovaps %ymm0, (%rcx) 506; AVX-NEXT: vzeroupper 507; AVX-NEXT: retq 508; 509; AVX2-LABEL: load_i32_stride3_vf8: 510; AVX2: # %bb.0: 511; AVX2-NEXT: vmovaps (%rdi), %ymm0 512; AVX2-NEXT: vmovaps 32(%rdi), %ymm1 513; AVX2-NEXT: vmovaps 64(%rdi), %ymm2 514; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] 515; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] 516; AVX2-NEXT: vmovaps {{.*#+}} ymm4 = [0,3,6,1,4,7,2,5] 517; AVX2-NEXT: vpermps %ymm3, %ymm4, %ymm3 518; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] 519; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0],ymm4[1,2],ymm2[3],ymm4[4,5],ymm2[6],ymm4[7] 520; AVX2-NEXT: vmovaps {{.*#+}} ymm5 = [1,4,7,2,5,0,3,6] 521; AVX2-NEXT: vpermps %ymm4, %ymm5, %ymm4 522; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] 523; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [2,5,0,3,6,u,u,u] 524; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 525; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,1,0,3,4,5,4,7] 526; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] 527; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] 528; AVX2-NEXT: vmovaps %ymm3, (%rsi) 529; AVX2-NEXT: vmovaps %ymm4, (%rdx) 530; AVX2-NEXT: vmovaps %ymm0, (%rcx) 531; AVX2-NEXT: vzeroupper 532; AVX2-NEXT: retq 533; 534; AVX2-FP-LABEL: load_i32_stride3_vf8: 535; AVX2-FP: # %bb.0: 536; AVX2-FP-NEXT: vmovaps (%rdi), %ymm0 537; AVX2-FP-NEXT: vmovaps 32(%rdi), %ymm1 538; AVX2-FP-NEXT: vmovaps 64(%rdi), %ymm2 539; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] 540; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] 541; AVX2-FP-NEXT: vmovaps {{.*#+}} ymm4 = [0,3,6,1,4,7,2,5] 542; AVX2-FP-NEXT: vpermps %ymm3, %ymm4, %ymm3 543; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] 544; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0],ymm4[1,2],ymm2[3],ymm4[4,5],ymm2[6],ymm4[7] 545; AVX2-FP-NEXT: vmovaps {{.*#+}} ymm5 = [1,4,7,2,5,0,3,6] 546; AVX2-FP-NEXT: vpermps %ymm4, %ymm5, %ymm4 547; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] 548; AVX2-FP-NEXT: vmovaps {{.*#+}} ymm1 = [2,5,0,3,6,u,u,u] 549; AVX2-FP-NEXT: vpermps %ymm0, %ymm1, %ymm0 550; AVX2-FP-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,1,0,3,4,5,4,7] 551; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] 552; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] 553; AVX2-FP-NEXT: vmovaps %ymm3, (%rsi) 554; AVX2-FP-NEXT: vmovaps %ymm4, (%rdx) 555; AVX2-FP-NEXT: vmovaps %ymm0, (%rcx) 556; AVX2-FP-NEXT: vzeroupper 557; AVX2-FP-NEXT: retq 558; 559; AVX2-FCP-LABEL: load_i32_stride3_vf8: 560; AVX2-FCP: # %bb.0: 561; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm0 562; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm1 563; AVX2-FCP-NEXT: vmovaps 64(%rdi), %ymm2 564; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] 565; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] 566; AVX2-FCP-NEXT: vmovaps {{.*#+}} ymm4 = [0,3,6,1,4,7,2,5] 567; AVX2-FCP-NEXT: vpermps %ymm3, %ymm4, %ymm3 568; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] 569; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0],ymm4[1,2],ymm2[3],ymm4[4,5],ymm2[6],ymm4[7] 570; AVX2-FCP-NEXT: vmovaps {{.*#+}} ymm5 = [1,4,7,2,5,0,3,6] 571; AVX2-FCP-NEXT: vpermps %ymm4, %ymm5, %ymm4 572; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] 573; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7] 574; AVX2-FCP-NEXT: vmovaps {{.*#+}} ymm1 = [2,5,0,3,6,1,4,7] 575; AVX2-FCP-NEXT: vpermps %ymm0, %ymm1, %ymm0 576; AVX2-FCP-NEXT: vmovaps %ymm3, (%rsi) 577; AVX2-FCP-NEXT: vmovaps %ymm4, (%rdx) 578; AVX2-FCP-NEXT: vmovaps %ymm0, (%rcx) 579; AVX2-FCP-NEXT: vzeroupper 580; AVX2-FCP-NEXT: retq 581; 582; AVX512-LABEL: load_i32_stride3_vf8: 583; AVX512: # %bb.0: 584; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 585; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm1 586; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,3,6,9,12,15,18,21] 587; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 588; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,4,7,10,13,16,19,22] 589; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 590; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm4 = [2,5,8,11,14,17,20,23] 591; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 592; AVX512-NEXT: vmovdqa %ymm2, (%rsi) 593; AVX512-NEXT: vmovdqa %ymm3, (%rdx) 594; AVX512-NEXT: vmovdqa %ymm4, (%rcx) 595; AVX512-NEXT: vzeroupper 596; AVX512-NEXT: retq 597; 598; AVX512-FCP-LABEL: load_i32_stride3_vf8: 599; AVX512-FCP: # %bb.0: 600; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 601; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 602; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,3,6,9,12,15,18,21] 603; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 604; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,4,7,10,13,16,19,22] 605; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 606; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [2,5,8,11,14,17,20,23] 607; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 608; AVX512-FCP-NEXT: vmovdqa %ymm2, (%rsi) 609; AVX512-FCP-NEXT: vmovdqa %ymm3, (%rdx) 610; AVX512-FCP-NEXT: vmovdqa %ymm4, (%rcx) 611; AVX512-FCP-NEXT: vzeroupper 612; AVX512-FCP-NEXT: retq 613; 614; AVX512DQ-LABEL: load_i32_stride3_vf8: 615; AVX512DQ: # %bb.0: 616; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm0 617; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm1 618; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,3,6,9,12,15,18,21] 619; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 620; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,4,7,10,13,16,19,22] 621; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 622; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm4 = [2,5,8,11,14,17,20,23] 623; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 624; AVX512DQ-NEXT: vmovdqa %ymm2, (%rsi) 625; AVX512DQ-NEXT: vmovdqa %ymm3, (%rdx) 626; AVX512DQ-NEXT: vmovdqa %ymm4, (%rcx) 627; AVX512DQ-NEXT: vzeroupper 628; AVX512DQ-NEXT: retq 629; 630; AVX512DQ-FCP-LABEL: load_i32_stride3_vf8: 631; AVX512DQ-FCP: # %bb.0: 632; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 633; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 634; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,3,6,9,12,15,18,21] 635; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 636; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,4,7,10,13,16,19,22] 637; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 638; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [2,5,8,11,14,17,20,23] 639; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 640; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, (%rsi) 641; AVX512DQ-FCP-NEXT: vmovdqa %ymm3, (%rdx) 642; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, (%rcx) 643; AVX512DQ-FCP-NEXT: vzeroupper 644; AVX512DQ-FCP-NEXT: retq 645; 646; AVX512BW-LABEL: load_i32_stride3_vf8: 647; AVX512BW: # %bb.0: 648; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 649; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 650; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,3,6,9,12,15,18,21] 651; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 652; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,4,7,10,13,16,19,22] 653; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 654; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm4 = [2,5,8,11,14,17,20,23] 655; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 656; AVX512BW-NEXT: vmovdqa %ymm2, (%rsi) 657; AVX512BW-NEXT: vmovdqa %ymm3, (%rdx) 658; AVX512BW-NEXT: vmovdqa %ymm4, (%rcx) 659; AVX512BW-NEXT: vzeroupper 660; AVX512BW-NEXT: retq 661; 662; AVX512BW-FCP-LABEL: load_i32_stride3_vf8: 663; AVX512BW-FCP: # %bb.0: 664; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 665; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 666; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,3,6,9,12,15,18,21] 667; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 668; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,4,7,10,13,16,19,22] 669; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 670; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [2,5,8,11,14,17,20,23] 671; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 672; AVX512BW-FCP-NEXT: vmovdqa %ymm2, (%rsi) 673; AVX512BW-FCP-NEXT: vmovdqa %ymm3, (%rdx) 674; AVX512BW-FCP-NEXT: vmovdqa %ymm4, (%rcx) 675; AVX512BW-FCP-NEXT: vzeroupper 676; AVX512BW-FCP-NEXT: retq 677; 678; AVX512DQ-BW-LABEL: load_i32_stride3_vf8: 679; AVX512DQ-BW: # %bb.0: 680; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0 681; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1 682; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,3,6,9,12,15,18,21] 683; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 684; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,4,7,10,13,16,19,22] 685; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 686; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm4 = [2,5,8,11,14,17,20,23] 687; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 688; AVX512DQ-BW-NEXT: vmovdqa %ymm2, (%rsi) 689; AVX512DQ-BW-NEXT: vmovdqa %ymm3, (%rdx) 690; AVX512DQ-BW-NEXT: vmovdqa %ymm4, (%rcx) 691; AVX512DQ-BW-NEXT: vzeroupper 692; AVX512DQ-BW-NEXT: retq 693; 694; AVX512DQ-BW-FCP-LABEL: load_i32_stride3_vf8: 695; AVX512DQ-BW-FCP: # %bb.0: 696; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 697; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 698; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,3,6,9,12,15,18,21] 699; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 700; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,4,7,10,13,16,19,22] 701; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 702; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [2,5,8,11,14,17,20,23] 703; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 704; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm2, (%rsi) 705; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm3, (%rdx) 706; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm4, (%rcx) 707; AVX512DQ-BW-FCP-NEXT: vzeroupper 708; AVX512DQ-BW-FCP-NEXT: retq 709 %wide.vec = load <24 x i32>, ptr %in.vec, align 64 710 %strided.vec0 = shufflevector <24 x i32> %wide.vec, <24 x i32> poison, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21> 711 %strided.vec1 = shufflevector <24 x i32> %wide.vec, <24 x i32> poison, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22> 712 %strided.vec2 = shufflevector <24 x i32> %wide.vec, <24 x i32> poison, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23> 713 store <8 x i32> %strided.vec0, ptr %out.vec0, align 64 714 store <8 x i32> %strided.vec1, ptr %out.vec1, align 64 715 store <8 x i32> %strided.vec2, ptr %out.vec2, align 64 716 ret void 717} 718 719define void @load_i32_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2) nounwind { 720; SSE-LABEL: load_i32_stride3_vf16: 721; SSE: # %bb.0: 722; SSE-NEXT: movaps 96(%rdi), %xmm6 723; SSE-NEXT: movaps 128(%rdi), %xmm12 724; SSE-NEXT: movaps 112(%rdi), %xmm13 725; SSE-NEXT: movaps 144(%rdi), %xmm11 726; SSE-NEXT: movaps 176(%rdi), %xmm10 727; SSE-NEXT: movaps 160(%rdi), %xmm9 728; SSE-NEXT: movaps (%rdi), %xmm7 729; SSE-NEXT: movaps 16(%rdi), %xmm8 730; SSE-NEXT: movaps 32(%rdi), %xmm3 731; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 732; SSE-NEXT: movaps 48(%rdi), %xmm15 733; SSE-NEXT: movaps 80(%rdi), %xmm14 734; SSE-NEXT: movaps 64(%rdi), %xmm2 735; SSE-NEXT: movaps %xmm2, %xmm0 736; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm14[1,0] 737; SSE-NEXT: movaps %xmm15, %xmm5 738; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm0[0,2] 739; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 740; SSE-NEXT: movaps %xmm8, %xmm0 741; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[1,0] 742; SSE-NEXT: movaps %xmm7, %xmm5 743; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm0[0,2] 744; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 745; SSE-NEXT: movaps %xmm9, %xmm0 746; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm10[1,0] 747; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 748; SSE-NEXT: movaps %xmm11, %xmm3 749; SSE-NEXT: movaps %xmm11, %xmm4 750; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 751; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm0[0,2] 752; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 753; SSE-NEXT: movaps %xmm13, %xmm0 754; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm12[1,0] 755; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 756; SSE-NEXT: movaps %xmm6, %xmm5 757; SSE-NEXT: movaps %xmm6, %xmm3 758; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm0[0,2] 759; SSE-NEXT: movaps %xmm15, %xmm11 760; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,0],xmm2[0,0] 761; SSE-NEXT: movaps %xmm2, %xmm0 762; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm14[2,3] 763; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,2],xmm0[0,2] 764; SSE-NEXT: movaps %xmm4, %xmm6 765; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[1,0],xmm9[0,0] 766; SSE-NEXT: movaps %xmm9, %xmm0 767; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm10[2,3] 768; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,2],xmm0[0,2] 769; SSE-NEXT: movaps %xmm3, %xmm1 770; SSE-NEXT: movaps %xmm3, %xmm10 771; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm13[0,0] 772; SSE-NEXT: movaps %xmm13, %xmm0 773; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm12[2,3] 774; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[0,2] 775; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,3,2,3] 776; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,0],xmm8[0,0] 777; SSE-NEXT: movaps %xmm8, %xmm12 778; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 779; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[3,1],xmm4[2,3] 780; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2],xmm12[0,2] 781; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm2[1,1,1,1] 782; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[2,3,2,3] 783; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1] 784; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm14[0,3] 785; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm8[1,1,1,1] 786; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] 787; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm4[0,3] 788; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[1,1,1,1] 789; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload 790; SSE-NEXT: # xmm8 = mem[2,3,2,3] 791; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1] 792; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload 793; SSE-NEXT: # xmm8 = xmm8[0,1],mem[0,3] 794; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm13[1,1,1,1] 795; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm10[2,3,2,3] 796; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] 797; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload 798; SSE-NEXT: # xmm4 = xmm4[0,1],mem[0,3] 799; SSE-NEXT: movaps %xmm5, 32(%rsi) 800; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 801; SSE-NEXT: movaps %xmm3, 48(%rsi) 802; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 803; SSE-NEXT: movaps %xmm3, (%rsi) 804; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 805; SSE-NEXT: movaps %xmm3, 16(%rsi) 806; SSE-NEXT: movaps %xmm1, 32(%rdx) 807; SSE-NEXT: movaps %xmm6, 48(%rdx) 808; SSE-NEXT: movaps %xmm7, (%rdx) 809; SSE-NEXT: movaps %xmm11, 16(%rdx) 810; SSE-NEXT: movaps %xmm4, 32(%rcx) 811; SSE-NEXT: movaps %xmm8, 48(%rcx) 812; SSE-NEXT: movaps %xmm0, (%rcx) 813; SSE-NEXT: movaps %xmm2, 16(%rcx) 814; SSE-NEXT: retq 815; 816; AVX-LABEL: load_i32_stride3_vf16: 817; AVX: # %bb.0: 818; AVX-NEXT: vmovaps 160(%rdi), %ymm0 819; AVX-NEXT: vmovaps 128(%rdi), %ymm1 820; AVX-NEXT: vmovaps 96(%rdi), %ymm2 821; AVX-NEXT: vmovaps 64(%rdi), %ymm3 822; AVX-NEXT: vmovaps 32(%rdi), %ymm4 823; AVX-NEXT: vmovaps (%rdi), %ymm6 824; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0],ymm4[1],ymm6[2,3],ymm4[4],ymm6[5,6],ymm4[7] 825; AVX-NEXT: vmovaps 16(%rdi), %xmm7 826; AVX-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,1],ymm4[1,3],ymm7[6,5],ymm4[5,7] 827; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm5[0,3],ymm7[0,2],ymm5[4,7],ymm7[4,6] 828; AVX-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm3[2,3,0,1] 829; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm3[1,0],ymm7[2,0],ymm3[5,4],ymm7[6,4] 830; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,1,2,0,4,5,6,4] 831; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm8[6,7] 832; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6],ymm1[7] 833; AVX-NEXT: vmovaps 112(%rdi), %xmm9 834; AVX-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,1],ymm1[1,3],ymm9[6,5],ymm1[5,7] 835; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,3],ymm9[0,2],ymm8[4,7],ymm9[4,6] 836; AVX-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm0[2,3,0,1] 837; AVX-NEXT: vshufps {{.*#+}} ymm10 = ymm0[1,0],ymm9[2,0],ymm0[5,4],ymm9[6,4] 838; AVX-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4] 839; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm10[6,7] 840; AVX-NEXT: vshufps {{.*#+}} ymm10 = ymm3[2,0],ymm7[3,0],ymm3[6,4],ymm7[7,4] 841; AVX-NEXT: vshufps {{.*#+}} ymm10 = ymm7[0,0],ymm10[2,0],ymm7[4,4],ymm10[6,4] 842; AVX-NEXT: vmovaps 16(%rdi), %xmm11 843; AVX-NEXT: vblendps {{.*#+}} ymm12 = ymm6[0,1],ymm4[2],ymm6[3,4],ymm4[5],ymm6[6,7] 844; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm12[1,2],ymm11[0,3],ymm12[5,6],ymm11[4,7] 845; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm12[0,2,3,1,4,6,7,5] 846; AVX-NEXT: vblendps {{.*#+}} ymm10 = ymm12[0,1,2,3,4],ymm10[5,6,7] 847; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm0[2,0],ymm9[3,0],ymm0[6,4],ymm9[7,4] 848; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm9[0,0],ymm12[2,0],ymm9[4,4],ymm12[6,4] 849; AVX-NEXT: vmovaps 112(%rdi), %xmm13 850; AVX-NEXT: vblendps {{.*#+}} ymm14 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] 851; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm14[1,2],ymm13[0,3],ymm14[5,6],ymm13[4,7] 852; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,2,3,1,4,6,7,5] 853; AVX-NEXT: vblendps {{.*#+}} ymm12 = ymm14[0,1,2,3,4],ymm12[5,6,7] 854; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm4[0,1],ymm6[2],ymm4[3,4],ymm6[5],ymm4[6,7] 855; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm11[1,0],ymm6[2,0],ymm11[5,4],ymm6[6,4] 856; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm6[2,0],ymm4[0,3],ymm6[6,4],ymm4[4,7] 857; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm7[0,1],ymm3[0,3],ymm7[4,5],ymm3[4,7] 858; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5,6,7] 859; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] 860; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm13[1,0],ymm2[2,0],ymm13[5,4],ymm2[6,4] 861; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,0],ymm1[0,3],ymm2[6,4],ymm1[4,7] 862; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm9[0,1],ymm0[0,3],ymm9[4,5],ymm0[4,7] 863; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] 864; AVX-NEXT: vmovaps %ymm8, 32(%rsi) 865; AVX-NEXT: vmovaps %ymm5, (%rsi) 866; AVX-NEXT: vmovaps %ymm12, 32(%rdx) 867; AVX-NEXT: vmovaps %ymm10, (%rdx) 868; AVX-NEXT: vmovaps %ymm0, 32(%rcx) 869; AVX-NEXT: vmovaps %ymm3, (%rcx) 870; AVX-NEXT: vzeroupper 871; AVX-NEXT: retq 872; 873; AVX2-LABEL: load_i32_stride3_vf16: 874; AVX2: # %bb.0: 875; AVX2-NEXT: vmovaps 160(%rdi), %ymm0 876; AVX2-NEXT: vmovaps 128(%rdi), %ymm1 877; AVX2-NEXT: vmovaps (%rdi), %ymm2 878; AVX2-NEXT: vmovaps 32(%rdi), %ymm3 879; AVX2-NEXT: vmovaps 64(%rdi), %ymm4 880; AVX2-NEXT: vmovaps 96(%rdi), %ymm5 881; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6],ymm3[7] 882; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm4[2],ymm6[3,4],ymm4[5],ymm6[6,7] 883; AVX2-NEXT: vmovaps {{.*#+}} ymm7 = [0,3,6,1,4,7,2,5] 884; AVX2-NEXT: vpermps %ymm6, %ymm7, %ymm6 885; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm5[0],ymm1[1],ymm5[2,3],ymm1[4],ymm5[5,6],ymm1[7] 886; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm0[2],ymm8[3,4],ymm0[5],ymm8[6,7] 887; AVX2-NEXT: vpermps %ymm8, %ymm7, %ymm7 888; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] 889; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm4[0],ymm8[1,2],ymm4[3],ymm8[4,5],ymm4[6],ymm8[7] 890; AVX2-NEXT: vmovaps {{.*#+}} ymm9 = [1,4,7,2,5,0,3,6] 891; AVX2-NEXT: vpermps %ymm8, %ymm9, %ymm8 892; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm5[0,1],ymm1[2],ymm5[3,4],ymm1[5],ymm5[6,7] 893; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0],ymm10[1,2],ymm0[3],ymm10[4,5],ymm0[6],ymm10[7] 894; AVX2-NEXT: vpermps %ymm10, %ymm9, %ymm9 895; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] 896; AVX2-NEXT: vmovaps {{.*#+}} ymm3 = [2,5,0,3,6,u,u,u] 897; AVX2-NEXT: vpermps %ymm2, %ymm3, %ymm2 898; AVX2-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,1,0,3,4,5,4,7] 899; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] 900; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5,6,7] 901; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm5[2],ymm1[3,4],ymm5[5],ymm1[6,7] 902; AVX2-NEXT: vpermps %ymm1, %ymm3, %ymm1 903; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,3,4,5,4,7] 904; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] 905; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] 906; AVX2-NEXT: vmovaps %ymm7, 32(%rsi) 907; AVX2-NEXT: vmovaps %ymm6, (%rsi) 908; AVX2-NEXT: vmovaps %ymm9, 32(%rdx) 909; AVX2-NEXT: vmovaps %ymm8, (%rdx) 910; AVX2-NEXT: vmovaps %ymm0, 32(%rcx) 911; AVX2-NEXT: vmovaps %ymm2, (%rcx) 912; AVX2-NEXT: vzeroupper 913; AVX2-NEXT: retq 914; 915; AVX2-FP-LABEL: load_i32_stride3_vf16: 916; AVX2-FP: # %bb.0: 917; AVX2-FP-NEXT: vmovaps 160(%rdi), %ymm0 918; AVX2-FP-NEXT: vmovaps 128(%rdi), %ymm1 919; AVX2-FP-NEXT: vmovaps (%rdi), %ymm2 920; AVX2-FP-NEXT: vmovaps 32(%rdi), %ymm3 921; AVX2-FP-NEXT: vmovaps 64(%rdi), %ymm4 922; AVX2-FP-NEXT: vmovaps 96(%rdi), %ymm5 923; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6],ymm3[7] 924; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm4[2],ymm6[3,4],ymm4[5],ymm6[6,7] 925; AVX2-FP-NEXT: vmovaps {{.*#+}} ymm7 = [0,3,6,1,4,7,2,5] 926; AVX2-FP-NEXT: vpermps %ymm6, %ymm7, %ymm6 927; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm5[0],ymm1[1],ymm5[2,3],ymm1[4],ymm5[5,6],ymm1[7] 928; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm0[2],ymm8[3,4],ymm0[5],ymm8[6,7] 929; AVX2-FP-NEXT: vpermps %ymm8, %ymm7, %ymm7 930; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] 931; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm4[0],ymm8[1,2],ymm4[3],ymm8[4,5],ymm4[6],ymm8[7] 932; AVX2-FP-NEXT: vmovaps {{.*#+}} ymm9 = [1,4,7,2,5,0,3,6] 933; AVX2-FP-NEXT: vpermps %ymm8, %ymm9, %ymm8 934; AVX2-FP-NEXT: vblendps {{.*#+}} ymm10 = ymm5[0,1],ymm1[2],ymm5[3,4],ymm1[5],ymm5[6,7] 935; AVX2-FP-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0],ymm10[1,2],ymm0[3],ymm10[4,5],ymm0[6],ymm10[7] 936; AVX2-FP-NEXT: vpermps %ymm10, %ymm9, %ymm9 937; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] 938; AVX2-FP-NEXT: vmovaps {{.*#+}} ymm3 = [2,5,0,3,6,u,u,u] 939; AVX2-FP-NEXT: vpermps %ymm2, %ymm3, %ymm2 940; AVX2-FP-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,1,0,3,4,5,4,7] 941; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] 942; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5,6,7] 943; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm5[2],ymm1[3,4],ymm5[5],ymm1[6,7] 944; AVX2-FP-NEXT: vpermps %ymm1, %ymm3, %ymm1 945; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,3,4,5,4,7] 946; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,0,3] 947; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] 948; AVX2-FP-NEXT: vmovaps %ymm7, 32(%rsi) 949; AVX2-FP-NEXT: vmovaps %ymm6, (%rsi) 950; AVX2-FP-NEXT: vmovaps %ymm9, 32(%rdx) 951; AVX2-FP-NEXT: vmovaps %ymm8, (%rdx) 952; AVX2-FP-NEXT: vmovaps %ymm0, 32(%rcx) 953; AVX2-FP-NEXT: vmovaps %ymm2, (%rcx) 954; AVX2-FP-NEXT: vzeroupper 955; AVX2-FP-NEXT: retq 956; 957; AVX2-FCP-LABEL: load_i32_stride3_vf16: 958; AVX2-FCP: # %bb.0: 959; AVX2-FCP-NEXT: vmovaps 160(%rdi), %ymm0 960; AVX2-FCP-NEXT: vmovaps 128(%rdi), %ymm1 961; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm2 962; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm3 963; AVX2-FCP-NEXT: vmovaps 64(%rdi), %ymm4 964; AVX2-FCP-NEXT: vmovaps 96(%rdi), %ymm5 965; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6],ymm3[7] 966; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm4[2],ymm6[3,4],ymm4[5],ymm6[6,7] 967; AVX2-FCP-NEXT: vmovaps {{.*#+}} ymm7 = [0,3,6,1,4,7,2,5] 968; AVX2-FCP-NEXT: vpermps %ymm6, %ymm7, %ymm6 969; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm5[0],ymm1[1],ymm5[2,3],ymm1[4],ymm5[5,6],ymm1[7] 970; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm0[2],ymm8[3,4],ymm0[5],ymm8[6,7] 971; AVX2-FCP-NEXT: vpermps %ymm8, %ymm7, %ymm7 972; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] 973; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm4[0],ymm8[1,2],ymm4[3],ymm8[4,5],ymm4[6],ymm8[7] 974; AVX2-FCP-NEXT: vmovaps {{.*#+}} ymm9 = [1,4,7,2,5,0,3,6] 975; AVX2-FCP-NEXT: vpermps %ymm8, %ymm9, %ymm8 976; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm5[0,1],ymm1[2],ymm5[3,4],ymm1[5],ymm5[6,7] 977; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm0[0],ymm10[1,2],ymm0[3],ymm10[4,5],ymm0[6],ymm10[7] 978; AVX2-FCP-NEXT: vpermps %ymm10, %ymm9, %ymm9 979; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] 980; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5,6],ymm4[7] 981; AVX2-FCP-NEXT: vmovaps {{.*#+}} ymm3 = [2,5,0,3,6,1,4,7] 982; AVX2-FCP-NEXT: vpermps %ymm2, %ymm3, %ymm2 983; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm5[2],ymm1[3,4],ymm5[5],ymm1[6,7] 984; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] 985; AVX2-FCP-NEXT: vpermps %ymm0, %ymm3, %ymm0 986; AVX2-FCP-NEXT: vmovaps %ymm7, 32(%rsi) 987; AVX2-FCP-NEXT: vmovaps %ymm6, (%rsi) 988; AVX2-FCP-NEXT: vmovaps %ymm9, 32(%rdx) 989; AVX2-FCP-NEXT: vmovaps %ymm8, (%rdx) 990; AVX2-FCP-NEXT: vmovaps %ymm0, 32(%rcx) 991; AVX2-FCP-NEXT: vmovaps %ymm2, (%rcx) 992; AVX2-FCP-NEXT: vzeroupper 993; AVX2-FCP-NEXT: retq 994; 995; AVX512-LABEL: load_i32_stride3_vf16: 996; AVX512: # %bb.0: 997; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 998; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm1 999; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm2 1000; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0] 1001; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 1002; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] 1003; AVX512-NEXT: vpermi2d %zmm2, %zmm3, %zmm4 1004; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm3 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0] 1005; AVX512-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 1006; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] 1007; AVX512-NEXT: vpermi2d %zmm2, %zmm3, %zmm5 1008; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm3 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0] 1009; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 1010; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] 1011; AVX512-NEXT: vpermi2d %zmm2, %zmm3, %zmm0 1012; AVX512-NEXT: vmovdqa64 %zmm4, (%rsi) 1013; AVX512-NEXT: vmovdqa64 %zmm5, (%rdx) 1014; AVX512-NEXT: vmovdqa64 %zmm0, (%rcx) 1015; AVX512-NEXT: vzeroupper 1016; AVX512-NEXT: retq 1017; 1018; AVX512-FCP-LABEL: load_i32_stride3_vf16: 1019; AVX512-FCP: # %bb.0: 1020; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 1021; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 1022; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 1023; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0] 1024; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 1025; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] 1026; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm4 1027; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0] 1028; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 1029; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] 1030; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm5 1031; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0] 1032; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 1033; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] 1034; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm0 1035; AVX512-FCP-NEXT: vmovdqa64 %zmm4, (%rsi) 1036; AVX512-FCP-NEXT: vmovdqa64 %zmm5, (%rdx) 1037; AVX512-FCP-NEXT: vmovdqa64 %zmm0, (%rcx) 1038; AVX512-FCP-NEXT: vzeroupper 1039; AVX512-FCP-NEXT: retq 1040; 1041; AVX512DQ-LABEL: load_i32_stride3_vf16: 1042; AVX512DQ: # %bb.0: 1043; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm0 1044; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm1 1045; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm2 1046; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0] 1047; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 1048; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] 1049; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm3, %zmm4 1050; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm3 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0] 1051; AVX512DQ-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 1052; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] 1053; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm3, %zmm5 1054; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm3 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0] 1055; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 1056; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] 1057; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm3, %zmm0 1058; AVX512DQ-NEXT: vmovdqa64 %zmm4, (%rsi) 1059; AVX512DQ-NEXT: vmovdqa64 %zmm5, (%rdx) 1060; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rcx) 1061; AVX512DQ-NEXT: vzeroupper 1062; AVX512DQ-NEXT: retq 1063; 1064; AVX512DQ-FCP-LABEL: load_i32_stride3_vf16: 1065; AVX512DQ-FCP: # %bb.0: 1066; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 1067; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 1068; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 1069; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0] 1070; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 1071; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] 1072; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm4 1073; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0] 1074; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 1075; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] 1076; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm5 1077; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0] 1078; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 1079; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] 1080; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm0 1081; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, (%rsi) 1082; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, (%rdx) 1083; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, (%rcx) 1084; AVX512DQ-FCP-NEXT: vzeroupper 1085; AVX512DQ-FCP-NEXT: retq 1086; 1087; AVX512BW-LABEL: load_i32_stride3_vf16: 1088; AVX512BW: # %bb.0: 1089; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 1090; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 1091; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm2 1092; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0] 1093; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 1094; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] 1095; AVX512BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm4 1096; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0] 1097; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 1098; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] 1099; AVX512BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm5 1100; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0] 1101; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 1102; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] 1103; AVX512BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm0 1104; AVX512BW-NEXT: vmovdqa64 %zmm4, (%rsi) 1105; AVX512BW-NEXT: vmovdqa64 %zmm5, (%rdx) 1106; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) 1107; AVX512BW-NEXT: vzeroupper 1108; AVX512BW-NEXT: retq 1109; 1110; AVX512BW-FCP-LABEL: load_i32_stride3_vf16: 1111; AVX512BW-FCP: # %bb.0: 1112; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 1113; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 1114; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 1115; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0] 1116; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 1117; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] 1118; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm4 1119; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0] 1120; AVX512BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 1121; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] 1122; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm5 1123; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0] 1124; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 1125; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] 1126; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm0 1127; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, (%rsi) 1128; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, (%rdx) 1129; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, (%rcx) 1130; AVX512BW-FCP-NEXT: vzeroupper 1131; AVX512BW-FCP-NEXT: retq 1132; 1133; AVX512DQ-BW-LABEL: load_i32_stride3_vf16: 1134; AVX512DQ-BW: # %bb.0: 1135; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0 1136; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1 1137; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm2 1138; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0] 1139; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 1140; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] 1141; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm4 1142; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0] 1143; AVX512DQ-BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 1144; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] 1145; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm5 1146; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm3 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0] 1147; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 1148; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] 1149; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm0 1150; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, (%rsi) 1151; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, (%rdx) 1152; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, (%rcx) 1153; AVX512DQ-BW-NEXT: vzeroupper 1154; AVX512DQ-BW-NEXT: retq 1155; 1156; AVX512DQ-BW-FCP-LABEL: load_i32_stride3_vf16: 1157; AVX512DQ-BW-FCP: # %bb.0: 1158; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0 1159; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1 1160; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2 1161; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0] 1162; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 1163; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] 1164; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm4 1165; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0] 1166; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 1167; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] 1168; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm5 1169; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0] 1170; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 1171; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] 1172; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm0 1173; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, (%rsi) 1174; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, (%rdx) 1175; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, (%rcx) 1176; AVX512DQ-BW-FCP-NEXT: vzeroupper 1177; AVX512DQ-BW-FCP-NEXT: retq 1178 %wide.vec = load <48 x i32>, ptr %in.vec, align 64 1179 %strided.vec0 = shufflevector <48 x i32> %wide.vec, <48 x i32> poison, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45> 1180 %strided.vec1 = shufflevector <48 x i32> %wide.vec, <48 x i32> poison, <16 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 34, i32 37, i32 40, i32 43, i32 46> 1181 %strided.vec2 = shufflevector <48 x i32> %wide.vec, <48 x i32> poison, <16 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23, i32 26, i32 29, i32 32, i32 35, i32 38, i32 41, i32 44, i32 47> 1182 store <16 x i32> %strided.vec0, ptr %out.vec0, align 64 1183 store <16 x i32> %strided.vec1, ptr %out.vec1, align 64 1184 store <16 x i32> %strided.vec2, ptr %out.vec2, align 64 1185 ret void 1186} 1187 1188define void @load_i32_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2) nounwind { 1189; SSE-LABEL: load_i32_stride3_vf32: 1190; SSE: # %bb.0: 1191; SSE-NEXT: subq $392, %rsp # imm = 0x188 1192; SSE-NEXT: movaps 192(%rdi), %xmm4 1193; SSE-NEXT: movaps 224(%rdi), %xmm3 1194; SSE-NEXT: movaps 208(%rdi), %xmm14 1195; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1196; SSE-NEXT: movaps 240(%rdi), %xmm7 1197; SSE-NEXT: movaps 272(%rdi), %xmm10 1198; SSE-NEXT: movaps 256(%rdi), %xmm9 1199; SSE-NEXT: movaps (%rdi), %xmm13 1200; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1201; SSE-NEXT: movaps 16(%rdi), %xmm8 1202; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1203; SSE-NEXT: movaps 32(%rdi), %xmm11 1204; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1205; SSE-NEXT: movaps 48(%rdi), %xmm2 1206; SSE-NEXT: movaps 80(%rdi), %xmm12 1207; SSE-NEXT: movaps 64(%rdi), %xmm5 1208; SSE-NEXT: movaps %xmm5, %xmm0 1209; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm12[1,0] 1210; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1211; SSE-NEXT: movaps %xmm2, %xmm1 1212; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1213; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2] 1214; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1215; SSE-NEXT: movaps %xmm9, %xmm0 1216; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm10[1,0] 1217; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1218; SSE-NEXT: movaps %xmm7, %xmm1 1219; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1220; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2] 1221; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1222; SSE-NEXT: movaps %xmm8, %xmm0 1223; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm11[1,0] 1224; SSE-NEXT: movaps %xmm13, %xmm1 1225; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2] 1226; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1227; SSE-NEXT: movaps %xmm14, %xmm0 1228; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[1,0] 1229; SSE-NEXT: movaps %xmm3, %xmm13 1230; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1231; SSE-NEXT: movaps %xmm4, %xmm1 1232; SSE-NEXT: movaps %xmm4, %xmm11 1233; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1234; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2] 1235; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1236; SSE-NEXT: movaps 176(%rdi), %xmm1 1237; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1238; SSE-NEXT: movaps 160(%rdi), %xmm0 1239; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1240; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,0] 1241; SSE-NEXT: movaps 144(%rdi), %xmm3 1242; SSE-NEXT: movaps %xmm3, %xmm1 1243; SSE-NEXT: movaps %xmm3, (%rsp) # 16-byte Spill 1244; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2] 1245; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1246; SSE-NEXT: movaps 368(%rdi), %xmm1 1247; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1248; SSE-NEXT: movaps 352(%rdi), %xmm0 1249; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1250; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,0] 1251; SSE-NEXT: movaps 336(%rdi), %xmm4 1252; SSE-NEXT: movaps %xmm4, %xmm1 1253; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1254; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2] 1255; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1256; SSE-NEXT: movaps 128(%rdi), %xmm1 1257; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1258; SSE-NEXT: movaps 112(%rdi), %xmm15 1259; SSE-NEXT: movaps %xmm15, %xmm0 1260; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,0] 1261; SSE-NEXT: movaps 96(%rdi), %xmm1 1262; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1263; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2] 1264; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1265; SSE-NEXT: movaps 320(%rdi), %xmm1 1266; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1267; SSE-NEXT: movaps 304(%rdi), %xmm6 1268; SSE-NEXT: movaps %xmm6, %xmm0 1269; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,0] 1270; SSE-NEXT: movaps 288(%rdi), %xmm8 1271; SSE-NEXT: movaps %xmm8, %xmm1 1272; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1273; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2] 1274; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1275; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1276; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm5[0,0] 1277; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[3,1],xmm12[2,3] 1278; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm5[0,2] 1279; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1280; SSE-NEXT: movaps %xmm7, %xmm14 1281; SSE-NEXT: movaps %xmm9, %xmm0 1282; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1283; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,0],xmm9[0,0] 1284; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm10[2,3] 1285; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,2],xmm0[0,2] 1286; SSE-NEXT: movaps %xmm11, %xmm10 1287; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1288; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[1,0],xmm0[0,0] 1289; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm13[2,3] 1290; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,2],xmm0[0,2] 1291; SSE-NEXT: movaps %xmm3, %xmm9 1292; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1293; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,0],xmm0[0,0] 1294; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 1295; SSE-NEXT: # xmm0 = xmm0[3,1],mem[2,3] 1296; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm0[0,2] 1297; SSE-NEXT: movaps %xmm4, %xmm7 1298; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1299; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,0],xmm0[0,0] 1300; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 1301; SSE-NEXT: # xmm0 = xmm0[3,1],mem[2,3] 1302; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2],xmm0[0,2] 1303; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 1304; SSE-NEXT: movaps %xmm4, %xmm11 1305; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,0],xmm15[0,0] 1306; SSE-NEXT: movaps %xmm15, %xmm0 1307; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload 1308; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1],xmm12[2,3] 1309; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,2],xmm0[0,2] 1310; SSE-NEXT: movaps %xmm8, %xmm3 1311; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1312; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm6[0,0] 1313; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload 1314; SSE-NEXT: # xmm6 = xmm6[3,1],mem[2,3] 1315; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm6[0,2] 1316; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 1317; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[2,3,2,3] 1318; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1319; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm0[0,0] 1320; SSE-NEXT: movaps %xmm0, %xmm13 1321; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 1322; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[3,1],xmm1[2,3] 1323; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm13[0,2] 1324; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1325; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload 1326; SSE-NEXT: # xmm13 = mem[1,1,1,1] 1327; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload 1328; SSE-NEXT: # xmm8 = mem[2,3,2,3] 1329; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm13[0],xmm8[1],xmm13[1] 1330; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload 1331; SSE-NEXT: # xmm8 = xmm8[0,1],mem[0,3] 1332; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm0[1,1,1,1] 1333; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1] 1334; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm1[0,3] 1335; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload 1336; SSE-NEXT: # xmm13 = mem[1,1,1,1] 1337; SSE-NEXT: pshufd $238, (%rsp), %xmm6 # 16-byte Folded Reload 1338; SSE-NEXT: # xmm6 = mem[2,3,2,3] 1339; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm13[0],xmm6[1],xmm13[1] 1340; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload 1341; SSE-NEXT: # xmm6 = xmm6[0,1],mem[0,3] 1342; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm15[1,1,1,1] 1343; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] 1344; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1] 1345; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm12[0,3] 1346; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload 1347; SSE-NEXT: # xmm15 = mem[1,1,1,1] 1348; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload 1349; SSE-NEXT: # xmm13 = mem[2,3,2,3] 1350; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1] 1351; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload 1352; SSE-NEXT: # xmm13 = xmm13[0,1],mem[0,3] 1353; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 1354; SSE-NEXT: # xmm0 = mem[1,1,1,1] 1355; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload 1356; SSE-NEXT: # xmm15 = mem[2,3,2,3] 1357; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] 1358; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload 1359; SSE-NEXT: # xmm15 = xmm15[0,1],mem[0,3] 1360; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 1361; SSE-NEXT: # xmm0 = mem[1,1,1,1] 1362; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 1363; SSE-NEXT: # xmm1 = mem[2,3,2,3] 1364; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 1365; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 1366; SSE-NEXT: # xmm1 = xmm1[0,1],mem[0,3] 1367; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 1368; SSE-NEXT: # xmm2 = mem[1,1,1,1] 1369; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 1370; SSE-NEXT: # xmm0 = mem[2,3,2,3] 1371; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] 1372; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 1373; SSE-NEXT: # xmm0 = xmm0[0,1],mem[0,3] 1374; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 1375; SSE-NEXT: movaps %xmm2, 96(%rsi) 1376; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 1377; SSE-NEXT: movaps %xmm2, 32(%rsi) 1378; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 1379; SSE-NEXT: movaps %xmm2, 112(%rsi) 1380; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 1381; SSE-NEXT: movaps %xmm2, 48(%rsi) 1382; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 1383; SSE-NEXT: movaps %xmm2, 64(%rsi) 1384; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 1385; SSE-NEXT: movaps %xmm2, (%rsi) 1386; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 1387; SSE-NEXT: movaps %xmm2, 80(%rsi) 1388; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 1389; SSE-NEXT: movaps %xmm2, 16(%rsi) 1390; SSE-NEXT: movaps %xmm3, 96(%rdx) 1391; SSE-NEXT: movaps %xmm11, 32(%rdx) 1392; SSE-NEXT: movaps %xmm7, 112(%rdx) 1393; SSE-NEXT: movaps %xmm9, 48(%rdx) 1394; SSE-NEXT: movaps %xmm10, 64(%rdx) 1395; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 1396; SSE-NEXT: movaps %xmm2, (%rdx) 1397; SSE-NEXT: movaps %xmm14, 80(%rdx) 1398; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 1399; SSE-NEXT: movaps %xmm2, 16(%rdx) 1400; SSE-NEXT: movaps %xmm0, 96(%rcx) 1401; SSE-NEXT: movaps %xmm1, 112(%rcx) 1402; SSE-NEXT: movaps %xmm15, 64(%rcx) 1403; SSE-NEXT: movaps %xmm13, 80(%rcx) 1404; SSE-NEXT: movaps %xmm4, 32(%rcx) 1405; SSE-NEXT: movaps %xmm6, 48(%rcx) 1406; SSE-NEXT: movaps %xmm5, (%rcx) 1407; SSE-NEXT: movaps %xmm8, 16(%rcx) 1408; SSE-NEXT: addq $392, %rsp # imm = 0x188 1409; SSE-NEXT: retq 1410; 1411; AVX-LABEL: load_i32_stride3_vf32: 1412; AVX: # %bb.0: 1413; AVX-NEXT: subq $392, %rsp # imm = 0x188 1414; AVX-NEXT: vmovaps 256(%rdi), %ymm2 1415; AVX-NEXT: vmovaps 224(%rdi), %ymm7 1416; AVX-NEXT: vmovaps 192(%rdi), %ymm3 1417; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1418; AVX-NEXT: vmovaps 352(%rdi), %ymm4 1419; AVX-NEXT: vmovaps 320(%rdi), %ymm5 1420; AVX-NEXT: vmovaps 288(%rdi), %ymm6 1421; AVX-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1422; AVX-NEXT: vmovaps 160(%rdi), %ymm10 1423; AVX-NEXT: vmovaps 128(%rdi), %ymm9 1424; AVX-NEXT: vmovaps 96(%rdi), %ymm0 1425; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1426; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm9[1],ymm0[2,3],ymm9[4],ymm0[5,6],ymm9[7] 1427; AVX-NEXT: vmovaps 112(%rdi), %xmm1 1428; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1],ymm9[1,3],ymm1[6,5],ymm9[5,7] 1429; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm1[0,2],ymm0[4,7],ymm1[4,6] 1430; AVX-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm10[2,3,0,1] 1431; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm10[1,0],ymm11[2,0],ymm10[5,4],ymm11[6,4] 1432; AVX-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1433; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] 1434; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] 1435; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1436; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0],ymm5[1],ymm6[2,3],ymm5[4],ymm6[5,6],ymm5[7] 1437; AVX-NEXT: vmovaps 304(%rdi), %xmm1 1438; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1],ymm5[1,3],ymm1[6,5],ymm5[5,7] 1439; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm1[0,2],ymm0[4,7],ymm1[4,6] 1440; AVX-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm4[2,3,0,1] 1441; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm4[1,0],ymm14[2,0],ymm4[5,4],ymm14[6,4] 1442; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1443; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] 1444; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] 1445; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1446; AVX-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1447; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0],ymm7[1],ymm3[2,3],ymm7[4],ymm3[5,6],ymm7[7] 1448; AVX-NEXT: vmovaps 208(%rdi), %xmm1 1449; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1],ymm7[1,3],ymm1[6,5],ymm7[5,7] 1450; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm1[0,2],ymm0[4,7],ymm1[4,6] 1451; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1452; AVX-NEXT: vperm2f128 {{.*#+}} ymm12 = ymm2[2,3,0,1] 1453; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,0],ymm12[2,0],ymm2[5,4],ymm12[6,4] 1454; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] 1455; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] 1456; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1457; AVX-NEXT: vmovaps 32(%rdi), %ymm15 1458; AVX-NEXT: vmovaps 16(%rdi), %xmm0 1459; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm15[1,3],ymm0[6,5],ymm15[5,7] 1460; AVX-NEXT: vmovaps (%rdi), %ymm2 1461; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0],ymm15[1],ymm2[2,3],ymm15[4],ymm2[5,6],ymm15[7] 1462; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1463; AVX-NEXT: vshufps {{.*#+}} ymm13 = ymm1[0,3],ymm0[0,2],ymm1[4,7],ymm0[4,6] 1464; AVX-NEXT: vmovaps 64(%rdi), %ymm7 1465; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm7[2,3,0,1] 1466; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm7[1,0],ymm0[2,0],ymm7[5,4],ymm0[6,4] 1467; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,1,2,0,4,5,6,4] 1468; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm8[6,7] 1469; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1470; AVX-NEXT: vmovups %ymm11, (%rsp) # 32-byte Spill 1471; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm10[2,0],ymm11[3,0],ymm10[6,4],ymm11[7,4] 1472; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm11[0,0],ymm8[2,0],ymm11[4,4],ymm8[6,4] 1473; AVX-NEXT: vmovaps 112(%rdi), %xmm13 1474; AVX-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm6 # 32-byte Folded Reload 1475; AVX-NEXT: # ymm6 = mem[0,1],ymm9[2],mem[3,4],ymm9[5],mem[6,7] 1476; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm6[1,2],ymm13[0,3],ymm6[5,6],ymm13[4,7] 1477; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,2,3,1,4,6,7,5] 1478; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3,4],ymm8[5,6,7] 1479; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1480; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm4[2,0],ymm14[3,0],ymm4[6,4],ymm14[7,4] 1481; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm14[0,0],ymm6[2,0],ymm14[4,4],ymm6[6,4] 1482; AVX-NEXT: vmovaps 304(%rdi), %xmm8 1483; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 1484; AVX-NEXT: vblendps {{.*#+}} ymm10 = ymm3[0,1],ymm5[2],ymm3[3,4],ymm5[5],ymm3[6,7] 1485; AVX-NEXT: vshufps {{.*#+}} ymm10 = ymm10[1,2],ymm8[0,3],ymm10[5,6],ymm8[4,7] 1486; AVX-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,2,3,1,4,6,7,5] 1487; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0,1,2,3,4],ymm6[5,6,7] 1488; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1489; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm7[2,0],ymm0[3,0],ymm7[6,4],ymm0[7,4] 1490; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm0[0,0],ymm6[2,0],ymm0[4,4],ymm6[6,4] 1491; AVX-NEXT: vmovaps 16(%rdi), %xmm11 1492; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0,1],ymm15[2],ymm2[3,4],ymm15[5],ymm2[6,7] 1493; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm4[1,2],ymm11[0,3],ymm4[5,6],ymm11[4,7] 1494; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,2,3,1,4,6,7,5] 1495; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm6[5,6,7] 1496; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1497; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 1498; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm1[2,0],ymm12[3,0],ymm1[6,4],ymm12[7,4] 1499; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm12[0,0],ymm4[2,0],ymm12[4,4],ymm4[6,4] 1500; AVX-NEXT: vmovaps 208(%rdi), %xmm10 1501; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 1502; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 1503; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] 1504; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm6[1,2],ymm10[0,3],ymm6[5,6],ymm10[4,7] 1505; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm6[0,2,3,1,4,6,7,5] 1506; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3,4],ymm4[5,6,7] 1507; AVX-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm6 # 32-byte Folded Reload 1508; AVX-NEXT: # ymm6 = ymm9[0,1],mem[2],ymm9[3,4],mem[5],ymm9[6,7] 1509; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm13[1,0],ymm6[2,0],ymm13[5,4],ymm6[6,4] 1510; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm6[2,0],ymm9[0,3],ymm6[6,4],ymm9[4,7] 1511; AVX-NEXT: vmovups (%rsp), %ymm9 # 32-byte Reload 1512; AVX-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload 1513; AVX-NEXT: # ymm9 = ymm9[0,1],mem[0,3],ymm9[4,5],mem[4,7] 1514; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm9[5,6,7] 1515; AVX-NEXT: vblendps {{.*#+}} ymm9 = ymm5[0,1],ymm3[2],ymm5[3,4],ymm3[5],ymm5[6,7] 1516; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm8[1,0],ymm9[2,0],ymm8[5,4],ymm9[6,4] 1517; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm8[2,0],ymm5[0,3],ymm8[6,4],ymm5[4,7] 1518; AVX-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm8 # 32-byte Folded Reload 1519; AVX-NEXT: # ymm8 = ymm14[0,1],mem[0,3],ymm14[4,5],mem[4,7] 1520; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm8[5,6,7] 1521; AVX-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm3 # 32-byte Folded Reload 1522; AVX-NEXT: # ymm3 = ymm15[0,1],mem[2],ymm15[3,4],mem[5],ymm15[6,7] 1523; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm11[1,0],ymm3[2,0],ymm11[5,4],ymm3[6,4] 1524; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0],ymm15[0,3],ymm3[6,4],ymm15[4,7] 1525; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1],ymm7[0,3],ymm0[4,5],ymm7[4,7] 1526; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm0[5,6,7] 1527; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] 1528; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm10[1,0],ymm1[2,0],ymm10[5,4],ymm1[6,4] 1529; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm2[0,3],ymm1[6,4],ymm2[4,7] 1530; AVX-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm2 # 32-byte Folded Reload 1531; AVX-NEXT: # ymm2 = ymm12[0,1],mem[0,3],ymm12[4,5],mem[4,7] 1532; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] 1533; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 1534; AVX-NEXT: vmovaps %ymm2, (%rsi) 1535; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 1536; AVX-NEXT: vmovaps %ymm2, 64(%rsi) 1537; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 1538; AVX-NEXT: vmovaps %ymm2, 96(%rsi) 1539; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 1540; AVX-NEXT: vmovaps %ymm2, 32(%rsi) 1541; AVX-NEXT: vmovaps %ymm4, 64(%rdx) 1542; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 1543; AVX-NEXT: vmovaps %ymm2, (%rdx) 1544; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 1545; AVX-NEXT: vmovaps %ymm2, 96(%rdx) 1546; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 1547; AVX-NEXT: vmovaps %ymm2, 32(%rdx) 1548; AVX-NEXT: vmovaps %ymm1, 64(%rcx) 1549; AVX-NEXT: vmovaps %ymm0, (%rcx) 1550; AVX-NEXT: vmovaps %ymm5, 96(%rcx) 1551; AVX-NEXT: vmovaps %ymm6, 32(%rcx) 1552; AVX-NEXT: addq $392, %rsp # imm = 0x188 1553; AVX-NEXT: vzeroupper 1554; AVX-NEXT: retq 1555; 1556; AVX2-LABEL: load_i32_stride3_vf32: 1557; AVX2: # %bb.0: 1558; AVX2-NEXT: subq $40, %rsp 1559; AVX2-NEXT: vmovaps 256(%rdi), %ymm15 1560; AVX2-NEXT: vmovaps 224(%rdi), %ymm5 1561; AVX2-NEXT: vmovaps 192(%rdi), %ymm3 1562; AVX2-NEXT: vmovaps 352(%rdi), %ymm4 1563; AVX2-NEXT: vmovaps 320(%rdi), %ymm8 1564; AVX2-NEXT: vmovaps 288(%rdi), %ymm10 1565; AVX2-NEXT: vmovaps 160(%rdi), %ymm13 1566; AVX2-NEXT: vmovaps 128(%rdi), %ymm1 1567; AVX2-NEXT: vmovaps (%rdi), %ymm6 1568; AVX2-NEXT: vmovaps 32(%rdi), %ymm9 1569; AVX2-NEXT: vmovaps 64(%rdi), %ymm7 1570; AVX2-NEXT: vmovaps 96(%rdi), %ymm0 1571; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] 1572; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm13[2],ymm11[3,4],ymm13[5],ymm11[6,7] 1573; AVX2-NEXT: vmovaps {{.*#+}} ymm12 = [0,3,6,1,4,7,2,5] 1574; AVX2-NEXT: vpermps %ymm11, %ymm12, %ymm2 1575; AVX2-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill 1576; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm10[0],ymm8[1],ymm10[2,3],ymm8[4],ymm10[5,6],ymm8[7] 1577; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm4[2],ymm11[3,4],ymm4[5],ymm11[6,7] 1578; AVX2-NEXT: vpermps %ymm11, %ymm12, %ymm2 1579; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1580; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm6[0],ymm9[1],ymm6[2,3],ymm9[4],ymm6[5,6],ymm9[7] 1581; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm7[2],ymm11[3,4],ymm7[5],ymm11[6,7] 1582; AVX2-NEXT: vpermps %ymm11, %ymm12, %ymm2 1583; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1584; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm3[0],ymm5[1],ymm3[2,3],ymm5[4],ymm3[5,6],ymm5[7] 1585; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2],ymm14[3,4],ymm15[5],ymm14[6,7] 1586; AVX2-NEXT: vpermps %ymm14, %ymm12, %ymm2 1587; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1588; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] 1589; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0],ymm12[1,2],ymm13[3],ymm12[4,5],ymm13[6],ymm12[7] 1590; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = [1,4,7,2,5,0,3,6] 1591; AVX2-NEXT: vpermps %ymm12, %ymm2, %ymm11 1592; AVX2-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1593; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm10[0,1],ymm8[2],ymm10[3,4],ymm8[5],ymm10[6,7] 1594; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm4[0],ymm11[1,2],ymm4[3],ymm11[4,5],ymm4[6],ymm11[7] 1595; AVX2-NEXT: vpermps %ymm11, %ymm2, %ymm11 1596; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm6[0,1],ymm9[2],ymm6[3,4],ymm9[5],ymm6[6,7] 1597; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm7[0],ymm12[1,2],ymm7[3],ymm12[4,5],ymm7[6],ymm12[7] 1598; AVX2-NEXT: vpermps %ymm12, %ymm2, %ymm12 1599; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm3[0,1],ymm5[2],ymm3[3,4],ymm5[5],ymm3[6,7] 1600; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0],ymm14[1,2],ymm15[3],ymm14[4,5],ymm15[6],ymm14[7] 1601; AVX2-NEXT: vpermps %ymm14, %ymm2, %ymm2 1602; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] 1603; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [2,5,0,3,6,u,u,u] 1604; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 1605; AVX2-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,1,0,3,4,5,4,7] 1606; AVX2-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,1,0,3] 1607; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm13[5,6,7] 1608; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm10[2],ymm8[3,4],ymm10[5],ymm8[6,7] 1609; AVX2-NEXT: vpermps %ymm8, %ymm1, %ymm8 1610; AVX2-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,1,0,3,4,5,4,7] 1611; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] 1612; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1,2,3,4],ymm4[5,6,7] 1613; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1],ymm6[2],ymm9[3,4],ymm6[5],ymm9[6,7] 1614; AVX2-NEXT: vpermps %ymm6, %ymm1, %ymm6 1615; AVX2-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,1,0,3,4,5,4,7] 1616; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] 1617; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6,7] 1618; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2],ymm5[3,4],ymm3[5],ymm5[6,7] 1619; AVX2-NEXT: vpermps %ymm3, %ymm1, %ymm1 1620; AVX2-NEXT: vshufps {{.*#+}} ymm3 = ymm15[0,1,0,3,4,5,4,7] 1621; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] 1622; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7] 1623; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 1624; AVX2-NEXT: vmovaps %ymm3, 64(%rsi) 1625; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 1626; AVX2-NEXT: vmovaps %ymm3, (%rsi) 1627; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 1628; AVX2-NEXT: vmovaps %ymm3, 96(%rsi) 1629; AVX2-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload 1630; AVX2-NEXT: vmovaps %ymm3, 32(%rsi) 1631; AVX2-NEXT: vmovaps %ymm2, 64(%rdx) 1632; AVX2-NEXT: vmovaps %ymm12, (%rdx) 1633; AVX2-NEXT: vmovaps %ymm11, 96(%rdx) 1634; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 1635; AVX2-NEXT: vmovaps %ymm2, 32(%rdx) 1636; AVX2-NEXT: vmovaps %ymm1, 64(%rcx) 1637; AVX2-NEXT: vmovaps %ymm6, (%rcx) 1638; AVX2-NEXT: vmovaps %ymm4, 96(%rcx) 1639; AVX2-NEXT: vmovaps %ymm0, 32(%rcx) 1640; AVX2-NEXT: addq $40, %rsp 1641; AVX2-NEXT: vzeroupper 1642; AVX2-NEXT: retq 1643; 1644; AVX2-FP-LABEL: load_i32_stride3_vf32: 1645; AVX2-FP: # %bb.0: 1646; AVX2-FP-NEXT: subq $40, %rsp 1647; AVX2-FP-NEXT: vmovaps 256(%rdi), %ymm15 1648; AVX2-FP-NEXT: vmovaps 224(%rdi), %ymm5 1649; AVX2-FP-NEXT: vmovaps 192(%rdi), %ymm3 1650; AVX2-FP-NEXT: vmovaps 352(%rdi), %ymm4 1651; AVX2-FP-NEXT: vmovaps 320(%rdi), %ymm8 1652; AVX2-FP-NEXT: vmovaps 288(%rdi), %ymm10 1653; AVX2-FP-NEXT: vmovaps 160(%rdi), %ymm13 1654; AVX2-FP-NEXT: vmovaps 128(%rdi), %ymm1 1655; AVX2-FP-NEXT: vmovaps (%rdi), %ymm6 1656; AVX2-FP-NEXT: vmovaps 32(%rdi), %ymm9 1657; AVX2-FP-NEXT: vmovaps 64(%rdi), %ymm7 1658; AVX2-FP-NEXT: vmovaps 96(%rdi), %ymm0 1659; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] 1660; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm13[2],ymm11[3,4],ymm13[5],ymm11[6,7] 1661; AVX2-FP-NEXT: vmovaps {{.*#+}} ymm12 = [0,3,6,1,4,7,2,5] 1662; AVX2-FP-NEXT: vpermps %ymm11, %ymm12, %ymm2 1663; AVX2-FP-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill 1664; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm10[0],ymm8[1],ymm10[2,3],ymm8[4],ymm10[5,6],ymm8[7] 1665; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm4[2],ymm11[3,4],ymm4[5],ymm11[6,7] 1666; AVX2-FP-NEXT: vpermps %ymm11, %ymm12, %ymm2 1667; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1668; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm6[0],ymm9[1],ymm6[2,3],ymm9[4],ymm6[5,6],ymm9[7] 1669; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm7[2],ymm11[3,4],ymm7[5],ymm11[6,7] 1670; AVX2-FP-NEXT: vpermps %ymm11, %ymm12, %ymm2 1671; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1672; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm3[0],ymm5[1],ymm3[2,3],ymm5[4],ymm3[5,6],ymm5[7] 1673; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1],ymm15[2],ymm14[3,4],ymm15[5],ymm14[6,7] 1674; AVX2-FP-NEXT: vpermps %ymm14, %ymm12, %ymm2 1675; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1676; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] 1677; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0],ymm12[1,2],ymm13[3],ymm12[4,5],ymm13[6],ymm12[7] 1678; AVX2-FP-NEXT: vmovaps {{.*#+}} ymm2 = [1,4,7,2,5,0,3,6] 1679; AVX2-FP-NEXT: vpermps %ymm12, %ymm2, %ymm11 1680; AVX2-FP-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1681; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm10[0,1],ymm8[2],ymm10[3,4],ymm8[5],ymm10[6,7] 1682; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm4[0],ymm11[1,2],ymm4[3],ymm11[4,5],ymm4[6],ymm11[7] 1683; AVX2-FP-NEXT: vpermps %ymm11, %ymm2, %ymm11 1684; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm6[0,1],ymm9[2],ymm6[3,4],ymm9[5],ymm6[6,7] 1685; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm7[0],ymm12[1,2],ymm7[3],ymm12[4,5],ymm7[6],ymm12[7] 1686; AVX2-FP-NEXT: vpermps %ymm12, %ymm2, %ymm12 1687; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm3[0,1],ymm5[2],ymm3[3,4],ymm5[5],ymm3[6,7] 1688; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0],ymm14[1,2],ymm15[3],ymm14[4,5],ymm15[6],ymm14[7] 1689; AVX2-FP-NEXT: vpermps %ymm14, %ymm2, %ymm2 1690; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] 1691; AVX2-FP-NEXT: vmovaps {{.*#+}} ymm1 = [2,5,0,3,6,u,u,u] 1692; AVX2-FP-NEXT: vpermps %ymm0, %ymm1, %ymm0 1693; AVX2-FP-NEXT: vshufps {{.*#+}} ymm13 = ymm13[0,1,0,3,4,5,4,7] 1694; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm13 = ymm13[0,1,0,3] 1695; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm13[5,6,7] 1696; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1],ymm10[2],ymm8[3,4],ymm10[5],ymm8[6,7] 1697; AVX2-FP-NEXT: vpermps %ymm8, %ymm1, %ymm8 1698; AVX2-FP-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,1,0,3,4,5,4,7] 1699; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] 1700; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1,2,3,4],ymm4[5,6,7] 1701; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1],ymm6[2],ymm9[3,4],ymm6[5],ymm9[6,7] 1702; AVX2-FP-NEXT: vpermps %ymm6, %ymm1, %ymm6 1703; AVX2-FP-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,1,0,3,4,5,4,7] 1704; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] 1705; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6,7] 1706; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1],ymm3[2],ymm5[3,4],ymm3[5],ymm5[6,7] 1707; AVX2-FP-NEXT: vpermps %ymm3, %ymm1, %ymm1 1708; AVX2-FP-NEXT: vshufps {{.*#+}} ymm3 = ymm15[0,1,0,3,4,5,4,7] 1709; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] 1710; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5,6,7] 1711; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 1712; AVX2-FP-NEXT: vmovaps %ymm3, 64(%rsi) 1713; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 1714; AVX2-FP-NEXT: vmovaps %ymm3, (%rsi) 1715; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 1716; AVX2-FP-NEXT: vmovaps %ymm3, 96(%rsi) 1717; AVX2-FP-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload 1718; AVX2-FP-NEXT: vmovaps %ymm3, 32(%rsi) 1719; AVX2-FP-NEXT: vmovaps %ymm2, 64(%rdx) 1720; AVX2-FP-NEXT: vmovaps %ymm12, (%rdx) 1721; AVX2-FP-NEXT: vmovaps %ymm11, 96(%rdx) 1722; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 1723; AVX2-FP-NEXT: vmovaps %ymm2, 32(%rdx) 1724; AVX2-FP-NEXT: vmovaps %ymm1, 64(%rcx) 1725; AVX2-FP-NEXT: vmovaps %ymm6, (%rcx) 1726; AVX2-FP-NEXT: vmovaps %ymm4, 96(%rcx) 1727; AVX2-FP-NEXT: vmovaps %ymm0, 32(%rcx) 1728; AVX2-FP-NEXT: addq $40, %rsp 1729; AVX2-FP-NEXT: vzeroupper 1730; AVX2-FP-NEXT: retq 1731; 1732; AVX2-FCP-LABEL: load_i32_stride3_vf32: 1733; AVX2-FCP: # %bb.0: 1734; AVX2-FCP-NEXT: subq $72, %rsp 1735; AVX2-FCP-NEXT: vmovaps 256(%rdi), %ymm0 1736; AVX2-FCP-NEXT: vmovaps 224(%rdi), %ymm2 1737; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1738; AVX2-FCP-NEXT: vmovaps 192(%rdi), %ymm1 1739; AVX2-FCP-NEXT: vmovaps 352(%rdi), %ymm4 1740; AVX2-FCP-NEXT: vmovaps 320(%rdi), %ymm6 1741; AVX2-FCP-NEXT: vmovaps 288(%rdi), %ymm7 1742; AVX2-FCP-NEXT: vmovaps 160(%rdi), %ymm13 1743; AVX2-FCP-NEXT: vmovaps 128(%rdi), %ymm14 1744; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm8 1745; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm9 1746; AVX2-FCP-NEXT: vmovaps 64(%rdi), %ymm10 1747; AVX2-FCP-NEXT: vmovaps 96(%rdi), %ymm15 1748; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm15[0],ymm14[1],ymm15[2,3],ymm14[4],ymm15[5,6],ymm14[7] 1749; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm13[2],ymm3[3,4],ymm13[5],ymm3[6,7] 1750; AVX2-FCP-NEXT: vmovaps {{.*#+}} ymm12 = [0,3,6,1,4,7,2,5] 1751; AVX2-FCP-NEXT: vpermps %ymm3, %ymm12, %ymm3 1752; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1753; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6],ymm6[7] 1754; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] 1755; AVX2-FCP-NEXT: vpermps %ymm5, %ymm12, %ymm3 1756; AVX2-FCP-NEXT: vmovups %ymm3, (%rsp) # 32-byte Spill 1757; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm8[0],ymm9[1],ymm8[2,3],ymm9[4],ymm8[5,6],ymm9[7] 1758; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1],ymm10[2],ymm11[3,4],ymm10[5],ymm11[6,7] 1759; AVX2-FCP-NEXT: vpermps %ymm11, %ymm12, %ymm3 1760; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1761; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7] 1762; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm0[2],ymm3[3,4],ymm0[5],ymm3[6,7] 1763; AVX2-FCP-NEXT: vmovaps %ymm0, %ymm2 1764; AVX2-FCP-NEXT: vpermps %ymm3, %ymm12, %ymm0 1765; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1766; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm15[0,1],ymm14[2],ymm15[3,4],ymm14[5],ymm15[6,7] 1767; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0],ymm12[1,2],ymm13[3],ymm12[4,5],ymm13[6],ymm12[7] 1768; AVX2-FCP-NEXT: vmovaps {{.*#+}} ymm3 = [1,4,7,2,5,0,3,6] 1769; AVX2-FCP-NEXT: vpermps %ymm12, %ymm3, %ymm5 1770; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 1771; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7] 1772; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm4[0],ymm5[1,2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7] 1773; AVX2-FCP-NEXT: vpermps %ymm5, %ymm3, %ymm5 1774; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm8[0,1],ymm9[2],ymm8[3,4],ymm9[5],ymm8[6,7] 1775; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm10[0],ymm11[1,2],ymm10[3],ymm11[4,5],ymm10[6],ymm11[7] 1776; AVX2-FCP-NEXT: vpermps %ymm11, %ymm3, %ymm11 1777; AVX2-FCP-NEXT: vmovaps %ymm1, %ymm0 1778; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 1779; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] 1780; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm2[0],ymm12[1,2],ymm2[3],ymm12[4,5],ymm2[6],ymm12[7] 1781; AVX2-FCP-NEXT: vpermps %ymm12, %ymm3, %ymm3 1782; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm14[0,1],ymm15[2],ymm14[3,4],ymm15[5],ymm14[6,7] 1783; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0],ymm13[1],ymm12[2,3],ymm13[4],ymm12[5,6],ymm13[7] 1784; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm7[2],ymm6[3,4],ymm7[5],ymm6[6,7] 1785; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2,3],ymm4[4],ymm6[5,6],ymm4[7] 1786; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1],ymm8[2],ymm9[3,4],ymm8[5],ymm9[6,7] 1787; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm10[1],ymm6[2,3],ymm10[4],ymm6[5,6],ymm10[7] 1788; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] 1789; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7] 1790; AVX2-FCP-NEXT: vmovaps {{.*#+}} ymm2 = [2,5,0,3,6,1,4,7] 1791; AVX2-FCP-NEXT: vpermps %ymm12, %ymm2, %ymm7 1792; AVX2-FCP-NEXT: vpermps %ymm4, %ymm2, %ymm4 1793; AVX2-FCP-NEXT: vpermps %ymm6, %ymm2, %ymm6 1794; AVX2-FCP-NEXT: vpermps %ymm1, %ymm2, %ymm1 1795; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 1796; AVX2-FCP-NEXT: vmovaps %ymm0, 64(%rsi) 1797; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 1798; AVX2-FCP-NEXT: vmovaps %ymm2, (%rsi) 1799; AVX2-FCP-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload 1800; AVX2-FCP-NEXT: vmovaps %ymm2, 96(%rsi) 1801; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 1802; AVX2-FCP-NEXT: vmovaps %ymm2, 32(%rsi) 1803; AVX2-FCP-NEXT: vmovaps %ymm3, 64(%rdx) 1804; AVX2-FCP-NEXT: vmovaps %ymm11, (%rdx) 1805; AVX2-FCP-NEXT: vmovaps %ymm5, 96(%rdx) 1806; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 1807; AVX2-FCP-NEXT: vmovaps %ymm0, 32(%rdx) 1808; AVX2-FCP-NEXT: vmovaps %ymm1, 64(%rcx) 1809; AVX2-FCP-NEXT: vmovaps %ymm6, (%rcx) 1810; AVX2-FCP-NEXT: vmovaps %ymm4, 96(%rcx) 1811; AVX2-FCP-NEXT: vmovaps %ymm7, 32(%rcx) 1812; AVX2-FCP-NEXT: addq $72, %rsp 1813; AVX2-FCP-NEXT: vzeroupper 1814; AVX2-FCP-NEXT: retq 1815; 1816; AVX512-LABEL: load_i32_stride3_vf32: 1817; AVX512: # %bb.0: 1818; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm0 1819; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm1 1820; AVX512-NEXT: vmovdqa64 (%rdi), %zmm2 1821; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm3 1822; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm4 1823; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm5 1824; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0] 1825; AVX512-NEXT: vmovdqa64 %zmm5, %zmm7 1826; AVX512-NEXT: vpermt2d %zmm1, %zmm6, %zmm7 1827; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] 1828; AVX512-NEXT: vpermt2d %zmm0, %zmm8, %zmm7 1829; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 1830; AVX512-NEXT: vpermt2d %zmm4, %zmm8, %zmm6 1831; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm8 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0] 1832; AVX512-NEXT: vmovdqa64 %zmm1, %zmm9 1833; AVX512-NEXT: vpermt2d %zmm5, %zmm8, %zmm9 1834; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] 1835; AVX512-NEXT: vpermt2d %zmm0, %zmm10, %zmm9 1836; AVX512-NEXT: vpermi2d %zmm2, %zmm3, %zmm8 1837; AVX512-NEXT: vpermt2d %zmm4, %zmm10, %zmm8 1838; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm10 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0] 1839; AVX512-NEXT: vpermt2d %zmm1, %zmm10, %zmm5 1840; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] 1841; AVX512-NEXT: vpermt2d %zmm0, %zmm1, %zmm5 1842; AVX512-NEXT: vpermt2d %zmm3, %zmm10, %zmm2 1843; AVX512-NEXT: vpermt2d %zmm4, %zmm1, %zmm2 1844; AVX512-NEXT: vmovdqa64 %zmm7, 64(%rsi) 1845; AVX512-NEXT: vmovdqa64 %zmm6, (%rsi) 1846; AVX512-NEXT: vmovdqa64 %zmm9, 64(%rdx) 1847; AVX512-NEXT: vmovdqa64 %zmm8, (%rdx) 1848; AVX512-NEXT: vmovdqa64 %zmm5, 64(%rcx) 1849; AVX512-NEXT: vmovdqa64 %zmm2, (%rcx) 1850; AVX512-NEXT: vzeroupper 1851; AVX512-NEXT: retq 1852; 1853; AVX512-FCP-LABEL: load_i32_stride3_vf32: 1854; AVX512-FCP: # %bb.0: 1855; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm0 1856; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm1 1857; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 1858; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm3 1859; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm4 1860; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm5 1861; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0] 1862; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 1863; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm6, %zmm7 1864; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] 1865; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm8, %zmm7 1866; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 1867; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm8, %zmm6 1868; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0] 1869; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm9 1870; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm8, %zmm9 1871; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] 1872; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm10, %zmm9 1873; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm8 1874; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm10, %zmm8 1875; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0] 1876; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm10, %zmm5 1877; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] 1878; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm5 1879; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm10, %zmm2 1880; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm1, %zmm2 1881; AVX512-FCP-NEXT: vmovdqa64 %zmm7, 64(%rsi) 1882; AVX512-FCP-NEXT: vmovdqa64 %zmm6, (%rsi) 1883; AVX512-FCP-NEXT: vmovdqa64 %zmm9, 64(%rdx) 1884; AVX512-FCP-NEXT: vmovdqa64 %zmm8, (%rdx) 1885; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 64(%rcx) 1886; AVX512-FCP-NEXT: vmovdqa64 %zmm2, (%rcx) 1887; AVX512-FCP-NEXT: vzeroupper 1888; AVX512-FCP-NEXT: retq 1889; 1890; AVX512DQ-LABEL: load_i32_stride3_vf32: 1891; AVX512DQ: # %bb.0: 1892; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm0 1893; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm1 1894; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm2 1895; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm3 1896; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm4 1897; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm5 1898; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0] 1899; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm7 1900; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm6, %zmm7 1901; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] 1902; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm8, %zmm7 1903; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 1904; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm8, %zmm6 1905; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm8 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0] 1906; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm9 1907; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm8, %zmm9 1908; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] 1909; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm10, %zmm9 1910; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm3, %zmm8 1911; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm10, %zmm8 1912; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm10 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0] 1913; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm10, %zmm5 1914; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] 1915; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm1, %zmm5 1916; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm10, %zmm2 1917; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm1, %zmm2 1918; AVX512DQ-NEXT: vmovdqa64 %zmm7, 64(%rsi) 1919; AVX512DQ-NEXT: vmovdqa64 %zmm6, (%rsi) 1920; AVX512DQ-NEXT: vmovdqa64 %zmm9, 64(%rdx) 1921; AVX512DQ-NEXT: vmovdqa64 %zmm8, (%rdx) 1922; AVX512DQ-NEXT: vmovdqa64 %zmm5, 64(%rcx) 1923; AVX512DQ-NEXT: vmovdqa64 %zmm2, (%rcx) 1924; AVX512DQ-NEXT: vzeroupper 1925; AVX512DQ-NEXT: retq 1926; 1927; AVX512DQ-FCP-LABEL: load_i32_stride3_vf32: 1928; AVX512DQ-FCP: # %bb.0: 1929; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm0 1930; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm1 1931; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 1932; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm3 1933; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm4 1934; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm5 1935; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0] 1936; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 1937; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm6, %zmm7 1938; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] 1939; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm8, %zmm7 1940; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 1941; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm8, %zmm6 1942; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0] 1943; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm9 1944; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm8, %zmm9 1945; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] 1946; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm10, %zmm9 1947; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm8 1948; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm10, %zmm8 1949; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0] 1950; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm10, %zmm5 1951; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] 1952; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm5 1953; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm10, %zmm2 1954; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm1, %zmm2 1955; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, 64(%rsi) 1956; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, (%rsi) 1957; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, 64(%rdx) 1958; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, (%rdx) 1959; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 64(%rcx) 1960; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, (%rcx) 1961; AVX512DQ-FCP-NEXT: vzeroupper 1962; AVX512DQ-FCP-NEXT: retq 1963; 1964; AVX512BW-LABEL: load_i32_stride3_vf32: 1965; AVX512BW: # %bb.0: 1966; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm0 1967; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm1 1968; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm2 1969; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm3 1970; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm4 1971; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm5 1972; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0] 1973; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm7 1974; AVX512BW-NEXT: vpermt2d %zmm1, %zmm6, %zmm7 1975; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] 1976; AVX512BW-NEXT: vpermt2d %zmm0, %zmm8, %zmm7 1977; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 1978; AVX512BW-NEXT: vpermt2d %zmm4, %zmm8, %zmm6 1979; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0] 1980; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm9 1981; AVX512BW-NEXT: vpermt2d %zmm5, %zmm8, %zmm9 1982; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] 1983; AVX512BW-NEXT: vpermt2d %zmm0, %zmm10, %zmm9 1984; AVX512BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm8 1985; AVX512BW-NEXT: vpermt2d %zmm4, %zmm10, %zmm8 1986; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm10 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0] 1987; AVX512BW-NEXT: vpermt2d %zmm1, %zmm10, %zmm5 1988; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] 1989; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm5 1990; AVX512BW-NEXT: vpermt2d %zmm3, %zmm10, %zmm2 1991; AVX512BW-NEXT: vpermt2d %zmm4, %zmm1, %zmm2 1992; AVX512BW-NEXT: vmovdqa64 %zmm7, 64(%rsi) 1993; AVX512BW-NEXT: vmovdqa64 %zmm6, (%rsi) 1994; AVX512BW-NEXT: vmovdqa64 %zmm9, 64(%rdx) 1995; AVX512BW-NEXT: vmovdqa64 %zmm8, (%rdx) 1996; AVX512BW-NEXT: vmovdqa64 %zmm5, 64(%rcx) 1997; AVX512BW-NEXT: vmovdqa64 %zmm2, (%rcx) 1998; AVX512BW-NEXT: vzeroupper 1999; AVX512BW-NEXT: retq 2000; 2001; AVX512BW-FCP-LABEL: load_i32_stride3_vf32: 2002; AVX512BW-FCP: # %bb.0: 2003; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm0 2004; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm1 2005; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 2006; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm3 2007; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm4 2008; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm5 2009; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0] 2010; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 2011; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm6, %zmm7 2012; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] 2013; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm8, %zmm7 2014; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 2015; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm8, %zmm6 2016; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0] 2017; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm9 2018; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm8, %zmm9 2019; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] 2020; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm10, %zmm9 2021; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm8 2022; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm10, %zmm8 2023; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0] 2024; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm10, %zmm5 2025; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] 2026; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm5 2027; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm10, %zmm2 2028; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm1, %zmm2 2029; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, 64(%rsi) 2030; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, (%rsi) 2031; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, 64(%rdx) 2032; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, (%rdx) 2033; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, 64(%rcx) 2034; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, (%rcx) 2035; AVX512BW-FCP-NEXT: vzeroupper 2036; AVX512BW-FCP-NEXT: retq 2037; 2038; AVX512DQ-BW-LABEL: load_i32_stride3_vf32: 2039; AVX512DQ-BW: # %bb.0: 2040; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm0 2041; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm1 2042; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm2 2043; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm3 2044; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm4 2045; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm5 2046; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0] 2047; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm7 2048; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm6, %zmm7 2049; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] 2050; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm8, %zmm7 2051; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 2052; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm8, %zmm6 2053; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm8 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0] 2054; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm9 2055; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm8, %zmm9 2056; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] 2057; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm10, %zmm9 2058; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm8 2059; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm10, %zmm8 2060; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm10 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0] 2061; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm10, %zmm5 2062; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] 2063; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm5 2064; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm10, %zmm2 2065; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm1, %zmm2 2066; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, 64(%rsi) 2067; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, (%rsi) 2068; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, 64(%rdx) 2069; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, (%rdx) 2070; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, 64(%rcx) 2071; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, (%rcx) 2072; AVX512DQ-BW-NEXT: vzeroupper 2073; AVX512DQ-BW-NEXT: retq 2074; 2075; AVX512DQ-BW-FCP-LABEL: load_i32_stride3_vf32: 2076; AVX512DQ-BW-FCP: # %bb.0: 2077; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm0 2078; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm1 2079; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 2080; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm3 2081; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm4 2082; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm5 2083; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0] 2084; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 2085; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm6, %zmm7 2086; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] 2087; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm8, %zmm7 2088; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 2089; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm8, %zmm6 2090; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm8 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0] 2091; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm9 2092; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm8, %zmm9 2093; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] 2094; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm10, %zmm9 2095; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm3, %zmm8 2096; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm10, %zmm8 2097; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm10 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0] 2098; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm10, %zmm5 2099; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] 2100; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm1, %zmm5 2101; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm10, %zmm2 2102; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm1, %zmm2 2103; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, 64(%rsi) 2104; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, (%rsi) 2105; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, 64(%rdx) 2106; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, (%rdx) 2107; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, 64(%rcx) 2108; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, (%rcx) 2109; AVX512DQ-BW-FCP-NEXT: vzeroupper 2110; AVX512DQ-BW-FCP-NEXT: retq 2111 %wide.vec = load <96 x i32>, ptr %in.vec, align 64 2112 %strided.vec0 = shufflevector <96 x i32> %wide.vec, <96 x i32> poison, <32 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45, i32 48, i32 51, i32 54, i32 57, i32 60, i32 63, i32 66, i32 69, i32 72, i32 75, i32 78, i32 81, i32 84, i32 87, i32 90, i32 93> 2113 %strided.vec1 = shufflevector <96 x i32> %wide.vec, <96 x i32> poison, <32 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 34, i32 37, i32 40, i32 43, i32 46, i32 49, i32 52, i32 55, i32 58, i32 61, i32 64, i32 67, i32 70, i32 73, i32 76, i32 79, i32 82, i32 85, i32 88, i32 91, i32 94> 2114 %strided.vec2 = shufflevector <96 x i32> %wide.vec, <96 x i32> poison, <32 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23, i32 26, i32 29, i32 32, i32 35, i32 38, i32 41, i32 44, i32 47, i32 50, i32 53, i32 56, i32 59, i32 62, i32 65, i32 68, i32 71, i32 74, i32 77, i32 80, i32 83, i32 86, i32 89, i32 92, i32 95> 2115 store <32 x i32> %strided.vec0, ptr %out.vec0, align 64 2116 store <32 x i32> %strided.vec1, ptr %out.vec1, align 64 2117 store <32 x i32> %strided.vec2, ptr %out.vec2, align 64 2118 ret void 2119} 2120 2121define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2) nounwind { 2122; SSE-LABEL: load_i32_stride3_vf64: 2123; SSE: # %bb.0: 2124; SSE-NEXT: subq $1112, %rsp # imm = 0x458 2125; SSE-NEXT: movaps 624(%rdi), %xmm2 2126; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2127; SSE-NEXT: movaps 656(%rdi), %xmm4 2128; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2129; SSE-NEXT: movaps 640(%rdi), %xmm10 2130; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2131; SSE-NEXT: movaps 432(%rdi), %xmm6 2132; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2133; SSE-NEXT: movaps 464(%rdi), %xmm5 2134; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2135; SSE-NEXT: movaps 448(%rdi), %xmm11 2136; SSE-NEXT: movaps %xmm11, (%rsp) # 16-byte Spill 2137; SSE-NEXT: movaps 240(%rdi), %xmm7 2138; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2139; SSE-NEXT: movaps 272(%rdi), %xmm3 2140; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2141; SSE-NEXT: movaps 256(%rdi), %xmm13 2142; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2143; SSE-NEXT: movaps 48(%rdi), %xmm9 2144; SSE-NEXT: movaps 80(%rdi), %xmm1 2145; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2146; SSE-NEXT: movaps 64(%rdi), %xmm12 2147; SSE-NEXT: movaps %xmm12, %xmm0 2148; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,0] 2149; SSE-NEXT: movaps %xmm9, %xmm1 2150; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2151; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2] 2152; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2153; SSE-NEXT: movaps %xmm13, %xmm0 2154; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm3[1,0] 2155; SSE-NEXT: movaps %xmm7, %xmm1 2156; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2] 2157; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2158; SSE-NEXT: movaps %xmm11, %xmm0 2159; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm5[1,0] 2160; SSE-NEXT: movaps %xmm6, %xmm1 2161; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2] 2162; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2163; SSE-NEXT: movaps %xmm10, %xmm0 2164; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm4[1,0] 2165; SSE-NEXT: movaps %xmm2, %xmm1 2166; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2] 2167; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2168; SSE-NEXT: movaps 16(%rdi), %xmm0 2169; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2170; SSE-NEXT: movaps 32(%rdi), %xmm1 2171; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2172; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,0] 2173; SSE-NEXT: movaps (%rdi), %xmm1 2174; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2175; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2] 2176; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2177; SSE-NEXT: movaps 224(%rdi), %xmm1 2178; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2179; SSE-NEXT: movaps 208(%rdi), %xmm0 2180; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2181; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,0] 2182; SSE-NEXT: movaps 192(%rdi), %xmm1 2183; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2184; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2] 2185; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2186; SSE-NEXT: movaps 416(%rdi), %xmm1 2187; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2188; SSE-NEXT: movaps 400(%rdi), %xmm0 2189; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2190; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,0] 2191; SSE-NEXT: movaps 384(%rdi), %xmm1 2192; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2193; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2] 2194; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2195; SSE-NEXT: movaps 608(%rdi), %xmm1 2196; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2197; SSE-NEXT: movaps 592(%rdi), %xmm0 2198; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2199; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,0] 2200; SSE-NEXT: movaps 576(%rdi), %xmm1 2201; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2202; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2] 2203; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2204; SSE-NEXT: movaps 176(%rdi), %xmm10 2205; SSE-NEXT: movaps 160(%rdi), %xmm8 2206; SSE-NEXT: movaps %xmm8, %xmm0 2207; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm10[1,0] 2208; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2209; SSE-NEXT: movaps 144(%rdi), %xmm2 2210; SSE-NEXT: movaps %xmm2, %xmm1 2211; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2212; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2] 2213; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2214; SSE-NEXT: movaps 368(%rdi), %xmm1 2215; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2216; SSE-NEXT: movaps 352(%rdi), %xmm15 2217; SSE-NEXT: movaps %xmm15, %xmm0 2218; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,0] 2219; SSE-NEXT: movaps 336(%rdi), %xmm14 2220; SSE-NEXT: movaps %xmm14, %xmm1 2221; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2222; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2] 2223; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2224; SSE-NEXT: movaps 560(%rdi), %xmm1 2225; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2226; SSE-NEXT: movaps 544(%rdi), %xmm0 2227; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2228; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,0] 2229; SSE-NEXT: movaps 528(%rdi), %xmm1 2230; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2231; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2] 2232; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2233; SSE-NEXT: movaps 752(%rdi), %xmm1 2234; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2235; SSE-NEXT: movaps 736(%rdi), %xmm0 2236; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2237; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[1,0] 2238; SSE-NEXT: movaps 720(%rdi), %xmm1 2239; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2240; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,2] 2241; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2242; SSE-NEXT: movaps 128(%rdi), %xmm6 2243; SSE-NEXT: movaps 112(%rdi), %xmm4 2244; SSE-NEXT: movaps %xmm4, %xmm1 2245; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm6[1,0] 2246; SSE-NEXT: movaps 96(%rdi), %xmm3 2247; SSE-NEXT: movaps %xmm3, %xmm7 2248; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2249; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,3],xmm1[0,2] 2250; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2251; SSE-NEXT: movaps 320(%rdi), %xmm13 2252; SSE-NEXT: movaps 304(%rdi), %xmm11 2253; SSE-NEXT: movaps %xmm11, %xmm1 2254; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm13[1,0] 2255; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2256; SSE-NEXT: movaps 288(%rdi), %xmm5 2257; SSE-NEXT: movaps %xmm5, %xmm7 2258; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2259; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,3],xmm1[0,2] 2260; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2261; SSE-NEXT: movaps 512(%rdi), %xmm0 2262; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2263; SSE-NEXT: movaps 496(%rdi), %xmm1 2264; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2265; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[1,0] 2266; SSE-NEXT: movaps 480(%rdi), %xmm7 2267; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2268; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,3],xmm1[0,2] 2269; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2270; SSE-NEXT: movaps 704(%rdi), %xmm7 2271; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2272; SSE-NEXT: movaps 688(%rdi), %xmm1 2273; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2274; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm7[1,0] 2275; SSE-NEXT: movaps 672(%rdi), %xmm7 2276; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2277; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,3],xmm1[0,2] 2278; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2279; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2280; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,0],xmm12[0,0] 2281; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload 2282; SSE-NEXT: # xmm12 = xmm12[3,1],mem[2,3] 2283; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,2],xmm12[0,2] 2284; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2285; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2286; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm8[0,0] 2287; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[3,1],xmm10[2,3] 2288; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm8[0,2] 2289; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2290; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2291; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm4[0,0] 2292; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,1],xmm6[2,3] 2293; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2] 2294; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2295; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload 2296; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 2297; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,0],xmm1[0,0] 2298; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 2299; SSE-NEXT: # xmm1 = xmm1[3,1],mem[2,3] 2300; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,2],xmm1[0,2] 2301; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2302; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload 2303; SSE-NEXT: movaps %xmm9, %xmm0 2304; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 2305; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm3[0,0] 2306; SSE-NEXT: movaps %xmm3, %xmm1 2307; SSE-NEXT: movaps %xmm3, %xmm12 2308; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 2309; SSE-NEXT: # xmm1 = xmm1[3,1],mem[2,3] 2310; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 2311; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2312; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2313; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,0],xmm15[0,0] 2314; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload 2315; SSE-NEXT: # xmm15 = xmm15[3,1],mem[2,3] 2316; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,2],xmm15[0,2] 2317; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2318; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2319; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm11[0,0] 2320; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[3,1],xmm13[2,3] 2321; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm11[0,2] 2322; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2323; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2324; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload 2325; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] 2326; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 2327; SSE-NEXT: # xmm1 = xmm1[3,1],mem[2,3] 2328; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 2329; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2330; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload 2331; SSE-NEXT: movaps %xmm7, %xmm0 2332; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 2333; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm4[0,0] 2334; SSE-NEXT: movaps %xmm4, %xmm1 2335; SSE-NEXT: movaps %xmm4, %xmm8 2336; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 2337; SSE-NEXT: # xmm1 = xmm1[3,1],mem[2,3] 2338; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 2339; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2340; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2341; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 2342; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] 2343; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 2344; SSE-NEXT: # xmm1 = xmm1[3,1],mem[2,3] 2345; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 2346; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2347; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2348; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 2349; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] 2350; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 2351; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm5[2,3] 2352; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 2353; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2354; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2355; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 2356; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] 2357; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 2358; SSE-NEXT: # xmm1 = xmm1[3,1],mem[2,3] 2359; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 2360; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2361; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2362; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 2363; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm14[0,0] 2364; SSE-NEXT: movaps %xmm14, %xmm1 2365; SSE-NEXT: movaps %xmm14, %xmm3 2366; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 2367; SSE-NEXT: # xmm1 = xmm1[3,1],mem[2,3] 2368; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 2369; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2370; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2371; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 2372; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0] 2373; SSE-NEXT: shufps $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 2374; SSE-NEXT: # xmm1 = xmm1[3,1],mem[2,3] 2375; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] 2376; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2377; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload 2378; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 2379; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,0],xmm1[0,0] 2380; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 2381; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm4[2,3] 2382; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,2],xmm1[0,2] 2383; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 2384; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm2[2,3,2,3] 2385; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2386; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm0[0,0] 2387; SSE-NEXT: movaps %xmm2, %xmm11 2388; SSE-NEXT: movaps %xmm0, %xmm1 2389; SSE-NEXT: movaps %xmm0, %xmm2 2390; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2391; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3] 2392; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,2],xmm1[0,2] 2393; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2394; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1] 2395; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1] 2396; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm0[0,3] 2397; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2398; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 2399; SSE-NEXT: # xmm1 = mem[1,1,1,1] 2400; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 2401; SSE-NEXT: # xmm0 = mem[2,3,2,3] 2402; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 2403; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 2404; SSE-NEXT: # xmm0 = xmm0[0,1],mem[0,3] 2405; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2406; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 2407; SSE-NEXT: # xmm0 = mem[1,1,1,1] 2408; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload 2409; SSE-NEXT: # xmm15 = mem[2,3,2,3] 2410; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1] 2411; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm6[0,3] 2412; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 2413; SSE-NEXT: # xmm0 = mem[1,1,1,1] 2414; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload 2415; SSE-NEXT: # xmm13 = mem[2,3,2,3] 2416; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1] 2417; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload 2418; SSE-NEXT: # xmm13 = xmm13[0,1],mem[0,3] 2419; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[1,1,1,1] 2420; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm9[2,3,2,3] 2421; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1] 2422; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload 2423; SSE-NEXT: # xmm12 = xmm12[0,1],mem[0,3] 2424; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 2425; SSE-NEXT: # xmm0 = mem[1,1,1,1] 2426; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload 2427; SSE-NEXT: # xmm11 = mem[2,3,2,3] 2428; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] 2429; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload 2430; SSE-NEXT: # xmm11 = xmm11[0,1],mem[0,3] 2431; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 2432; SSE-NEXT: # xmm0 = mem[1,1,1,1] 2433; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload 2434; SSE-NEXT: # xmm10 = mem[2,3,2,3] 2435; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] 2436; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload 2437; SSE-NEXT: # xmm10 = xmm10[0,1],mem[0,3] 2438; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 2439; SSE-NEXT: # xmm0 = mem[1,1,1,1] 2440; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload 2441; SSE-NEXT: # xmm9 = mem[2,3,2,3] 2442; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] 2443; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload 2444; SSE-NEXT: # xmm9 = xmm9[0,1],mem[0,3] 2445; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1] 2446; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm7[2,3,2,3] 2447; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] 2448; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload 2449; SSE-NEXT: # xmm8 = xmm8[0,1],mem[0,3] 2450; SSE-NEXT: pshufd $85, (%rsp), %xmm0 # 16-byte Folded Reload 2451; SSE-NEXT: # xmm0 = mem[1,1,1,1] 2452; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload 2453; SSE-NEXT: # xmm7 = mem[2,3,2,3] 2454; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] 2455; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload 2456; SSE-NEXT: # xmm7 = xmm7[0,1],mem[0,3] 2457; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 2458; SSE-NEXT: # xmm0 = mem[1,1,1,1] 2459; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload 2460; SSE-NEXT: # xmm6 = mem[2,3,2,3] 2461; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] 2462; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm5[0,3] 2463; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 2464; SSE-NEXT: # xmm0 = mem[1,1,1,1] 2465; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload 2466; SSE-NEXT: # xmm5 = mem[2,3,2,3] 2467; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] 2468; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload 2469; SSE-NEXT: # xmm5 = xmm5[0,1],mem[0,3] 2470; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,1,1] 2471; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload 2472; SSE-NEXT: # xmm3 = mem[2,3,2,3] 2473; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] 2474; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload 2475; SSE-NEXT: # xmm3 = xmm3[0,1],mem[0,3] 2476; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 2477; SSE-NEXT: # xmm0 = mem[1,1,1,1] 2478; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 2479; SSE-NEXT: # xmm2 = mem[2,3,2,3] 2480; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] 2481; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload 2482; SSE-NEXT: # xmm2 = xmm2[0,1],mem[0,3] 2483; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 2484; SSE-NEXT: # xmm0 = mem[1,1,1,1] 2485; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 2486; SSE-NEXT: # xmm1 = mem[2,3,2,3] 2487; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] 2488; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm4[0,3] 2489; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload 2490; SSE-NEXT: # xmm4 = mem[1,1,1,1] 2491; SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 2492; SSE-NEXT: # xmm0 = mem[2,3,2,3] 2493; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] 2494; SSE-NEXT: shufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 2495; SSE-NEXT: # xmm0 = xmm0[0,1],mem[0,3] 2496; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 2497; SSE-NEXT: movaps %xmm4, 224(%rsi) 2498; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 2499; SSE-NEXT: movaps %xmm4, 160(%rsi) 2500; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 2501; SSE-NEXT: movaps %xmm4, 96(%rsi) 2502; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 2503; SSE-NEXT: movaps %xmm4, 32(%rsi) 2504; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 2505; SSE-NEXT: movaps %xmm4, 240(%rsi) 2506; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 2507; SSE-NEXT: movaps %xmm4, 176(%rsi) 2508; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 2509; SSE-NEXT: movaps %xmm4, 112(%rsi) 2510; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 2511; SSE-NEXT: movaps %xmm4, 48(%rsi) 2512; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 2513; SSE-NEXT: movaps %xmm4, 192(%rsi) 2514; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 2515; SSE-NEXT: movaps %xmm4, 128(%rsi) 2516; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 2517; SSE-NEXT: movaps %xmm4, 64(%rsi) 2518; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 2519; SSE-NEXT: movaps %xmm4, (%rsi) 2520; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 2521; SSE-NEXT: movaps %xmm4, 208(%rsi) 2522; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 2523; SSE-NEXT: movaps %xmm4, 144(%rsi) 2524; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 2525; SSE-NEXT: movaps %xmm4, 80(%rsi) 2526; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 2527; SSE-NEXT: movaps %xmm4, 16(%rsi) 2528; SSE-NEXT: movaps %xmm14, 224(%rdx) 2529; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 2530; SSE-NEXT: movaps %xmm4, 240(%rdx) 2531; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 2532; SSE-NEXT: movaps %xmm4, 192(%rdx) 2533; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 2534; SSE-NEXT: movaps %xmm4, 208(%rdx) 2535; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 2536; SSE-NEXT: movaps %xmm4, 160(%rdx) 2537; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 2538; SSE-NEXT: movaps %xmm4, 176(%rdx) 2539; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 2540; SSE-NEXT: movaps %xmm4, 128(%rdx) 2541; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 2542; SSE-NEXT: movaps %xmm4, 144(%rdx) 2543; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 2544; SSE-NEXT: movaps %xmm4, 96(%rdx) 2545; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 2546; SSE-NEXT: movaps %xmm4, 112(%rdx) 2547; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 2548; SSE-NEXT: movaps %xmm4, 64(%rdx) 2549; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 2550; SSE-NEXT: movaps %xmm4, 80(%rdx) 2551; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 2552; SSE-NEXT: movaps %xmm4, 32(%rdx) 2553; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 2554; SSE-NEXT: movaps %xmm4, 48(%rdx) 2555; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 2556; SSE-NEXT: movaps %xmm4, (%rdx) 2557; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 2558; SSE-NEXT: movaps %xmm4, 16(%rdx) 2559; SSE-NEXT: movaps %xmm0, 240(%rcx) 2560; SSE-NEXT: movaps %xmm1, 224(%rcx) 2561; SSE-NEXT: movaps %xmm2, 208(%rcx) 2562; SSE-NEXT: movaps %xmm3, 192(%rcx) 2563; SSE-NEXT: movaps %xmm5, 176(%rcx) 2564; SSE-NEXT: movaps %xmm6, 160(%rcx) 2565; SSE-NEXT: movaps %xmm7, 144(%rcx) 2566; SSE-NEXT: movaps %xmm8, 128(%rcx) 2567; SSE-NEXT: movaps %xmm9, 112(%rcx) 2568; SSE-NEXT: movaps %xmm10, 96(%rcx) 2569; SSE-NEXT: movaps %xmm11, 80(%rcx) 2570; SSE-NEXT: movaps %xmm12, 64(%rcx) 2571; SSE-NEXT: movaps %xmm13, 48(%rcx) 2572; SSE-NEXT: movaps %xmm15, 32(%rcx) 2573; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2574; SSE-NEXT: movaps %xmm0, 16(%rcx) 2575; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2576; SSE-NEXT: movaps %xmm0, (%rcx) 2577; SSE-NEXT: addq $1112, %rsp # imm = 0x458 2578; SSE-NEXT: retq 2579; 2580; AVX-LABEL: load_i32_stride3_vf64: 2581; AVX: # %bb.0: 2582; AVX-NEXT: subq $1384, %rsp # imm = 0x568 2583; AVX-NEXT: vmovaps 544(%rdi), %ymm2 2584; AVX-NEXT: vmovaps 512(%rdi), %ymm3 2585; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2586; AVX-NEXT: vmovaps 480(%rdi), %ymm4 2587; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2588; AVX-NEXT: vmovaps 352(%rdi), %ymm5 2589; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2590; AVX-NEXT: vmovaps 320(%rdi), %ymm6 2591; AVX-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2592; AVX-NEXT: vmovaps 288(%rdi), %ymm7 2593; AVX-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2594; AVX-NEXT: vmovaps 160(%rdi), %ymm8 2595; AVX-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2596; AVX-NEXT: vmovaps 128(%rdi), %ymm9 2597; AVX-NEXT: vmovups %ymm9, (%rsp) # 32-byte Spill 2598; AVX-NEXT: vmovaps 96(%rdi), %ymm0 2599; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2600; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm9[1],ymm0[2,3],ymm9[4],ymm0[5,6],ymm9[7] 2601; AVX-NEXT: vmovaps 112(%rdi), %xmm1 2602; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1],ymm9[1,3],ymm1[6,5],ymm9[5,7] 2603; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm1[0,2],ymm0[4,7],ymm1[4,6] 2604; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm8[2,3,0,1] 2605; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2606; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,0],ymm1[2,0],ymm8[5,4],ymm1[6,4] 2607; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] 2608; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] 2609; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2610; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6],ymm6[7] 2611; AVX-NEXT: vmovaps 304(%rdi), %xmm1 2612; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1],ymm6[1,3],ymm1[6,5],ymm6[5,7] 2613; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm1[0,2],ymm0[4,7],ymm1[4,6] 2614; AVX-NEXT: vperm2f128 {{.*#+}} ymm14 = ymm5[2,3,0,1] 2615; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,0],ymm14[2,0],ymm5[5,4],ymm14[6,4] 2616; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] 2617; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] 2618; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2619; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7] 2620; AVX-NEXT: vmovaps 496(%rdi), %xmm1 2621; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,1],ymm3[1,3],ymm1[6,5],ymm3[5,7] 2622; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,3],ymm1[0,2],ymm0[4,7],ymm1[4,6] 2623; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2624; AVX-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm2[2,3,0,1] 2625; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,0],ymm3[2,0],ymm2[5,4],ymm3[6,4] 2626; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] 2627; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] 2628; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2629; AVX-NEXT: vmovaps 704(%rdi), %ymm2 2630; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2631; AVX-NEXT: vmovaps 688(%rdi), %xmm0 2632; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm2[1,3],ymm0[6,5],ymm2[5,7] 2633; AVX-NEXT: vmovaps 672(%rdi), %ymm1 2634; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2635; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5,6],ymm2[7] 2636; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,3],ymm0[0,2],ymm1[4,7],ymm0[4,6] 2637; AVX-NEXT: vmovaps 736(%rdi), %ymm1 2638; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2639; AVX-NEXT: vperm2f128 {{.*#+}} ymm15 = ymm1[2,3,0,1] 2640; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,0],ymm15[2,0],ymm1[5,4],ymm15[6,4] 2641; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] 2642; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] 2643; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2644; AVX-NEXT: vmovaps 32(%rdi), %ymm7 2645; AVX-NEXT: vmovaps 16(%rdi), %xmm0 2646; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm7[1,3],ymm0[6,5],ymm7[5,7] 2647; AVX-NEXT: vmovaps (%rdi), %ymm1 2648; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2649; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm7[1],ymm1[2,3],ymm7[4],ymm1[5,6],ymm7[7] 2650; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,3],ymm0[0,2],ymm1[4,7],ymm0[4,6] 2651; AVX-NEXT: vmovaps 64(%rdi), %ymm4 2652; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm4[2,3,0,1] 2653; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2654; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm4[1,0],ymm1[2,0],ymm4[5,4],ymm1[6,4] 2655; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2656; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] 2657; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] 2658; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2659; AVX-NEXT: vmovaps 224(%rdi), %ymm6 2660; AVX-NEXT: vmovaps 208(%rdi), %xmm0 2661; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm6[1,3],ymm0[6,5],ymm6[5,7] 2662; AVX-NEXT: vmovaps 192(%rdi), %ymm1 2663; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2664; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm6[1],ymm1[2,3],ymm6[4],ymm1[5,6],ymm6[7] 2665; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,3],ymm0[0,2],ymm1[4,7],ymm0[4,6] 2666; AVX-NEXT: vmovaps 256(%rdi), %ymm5 2667; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[2,3,0,1] 2668; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2669; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm5[1,0],ymm1[2,0],ymm5[5,4],ymm1[6,4] 2670; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2671; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] 2672; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] 2673; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2674; AVX-NEXT: vmovaps 416(%rdi), %ymm12 2675; AVX-NEXT: vmovaps 400(%rdi), %xmm0 2676; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm12[1,3],ymm0[6,5],ymm12[5,7] 2677; AVX-NEXT: vmovaps 384(%rdi), %ymm1 2678; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2679; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm12[1],ymm1[2,3],ymm12[4],ymm1[5,6],ymm12[7] 2680; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,3],ymm0[0,2],ymm1[4,7],ymm0[4,6] 2681; AVX-NEXT: vmovaps 448(%rdi), %ymm8 2682; AVX-NEXT: vperm2f128 {{.*#+}} ymm9 = ymm8[2,3,0,1] 2683; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm8[1,0],ymm9[2,0],ymm8[5,4],ymm9[6,4] 2684; AVX-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2685; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] 2686; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] 2687; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2688; AVX-NEXT: vmovaps 608(%rdi), %ymm10 2689; AVX-NEXT: vmovaps 592(%rdi), %xmm0 2690; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,1],ymm10[1,3],ymm0[6,5],ymm10[5,7] 2691; AVX-NEXT: vmovaps 576(%rdi), %ymm1 2692; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2693; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm10[1],ymm1[2,3],ymm10[4],ymm1[5,6],ymm10[7] 2694; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,3],ymm0[0,2],ymm1[4,7],ymm0[4,6] 2695; AVX-NEXT: vmovaps 640(%rdi), %ymm13 2696; AVX-NEXT: vperm2f128 {{.*#+}} ymm11 = ymm13[2,3,0,1] 2697; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,0],ymm11[2,0],ymm13[5,4],ymm11[6,4] 2698; AVX-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2699; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4] 2700; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] 2701; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2702; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 2703; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 2704; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm1[3,0],ymm0[6,4],ymm1[7,4] 2705; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm1[0,0],ymm0[2,0],ymm1[4,4],ymm0[6,4] 2706; AVX-NEXT: vmovaps 112(%rdi), %xmm0 2707; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2708; AVX-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload 2709; AVX-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload 2710; AVX-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] 2711; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,2],ymm0[0,3],ymm1[5,6],ymm0[4,7] 2712; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5] 2713; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm2[5,6,7] 2714; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2715; AVX-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2716; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 2717; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm14[3,0],ymm0[6,4],ymm14[7,4] 2718; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm14[0,0],ymm0[2,0],ymm14[4,4],ymm0[6,4] 2719; AVX-NEXT: vmovaps 304(%rdi), %xmm2 2720; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2721; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 2722; AVX-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload 2723; AVX-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] 2724; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,2],ymm2[0,3],ymm1[5,6],ymm2[4,7] 2725; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5] 2726; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] 2727; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2728; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2729; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 2730; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm3[3,0],ymm0[6,4],ymm3[7,4] 2731; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm3[0,0],ymm0[2,0],ymm3[4,4],ymm0[6,4] 2732; AVX-NEXT: vmovaps 496(%rdi), %xmm3 2733; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2734; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 2735; AVX-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload 2736; AVX-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] 2737; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,2],ymm3[0,3],ymm1[5,6],ymm3[4,7] 2738; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5] 2739; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] 2740; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2741; AVX-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2742; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 2743; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm15[3,0],ymm0[6,4],ymm15[7,4] 2744; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm15[0,0],ymm0[2,0],ymm15[4,4],ymm0[6,4] 2745; AVX-NEXT: vmovaps 688(%rdi), %xmm3 2746; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2747; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 2748; AVX-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload 2749; AVX-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] 2750; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,2],ymm3[0,3],ymm1[5,6],ymm3[4,7] 2751; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,2,3,1,4,6,7,5] 2752; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5,6,7] 2753; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2754; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 2755; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm4[2,0],ymm1[3,0],ymm4[6,4],ymm1[7,4] 2756; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0],ymm0[2,0],ymm1[4,4],ymm0[6,4] 2757; AVX-NEXT: vmovaps 16(%rdi), %xmm4 2758; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 2759; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1],ymm7[2],ymm0[3,4],ymm7[5],ymm0[6,7] 2760; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm2[1,2],ymm4[0,3],ymm2[5,6],ymm4[4,7] 2761; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2,3,1,4,6,7,5] 2762; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5,6,7] 2763; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2764; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 2765; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm5[2,0],ymm2[3,0],ymm5[6,4],ymm2[7,4] 2766; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,0],ymm1[2,0],ymm2[4,4],ymm1[6,4] 2767; AVX-NEXT: vmovaps 208(%rdi), %xmm5 2768; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 2769; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1],ymm6[2],ymm1[3,4],ymm6[5],ymm1[6,7] 2770; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm3[1,2],ymm5[0,3],ymm3[5,6],ymm5[4,7] 2771; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,2,3,1,4,6,7,5] 2772; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5,6,7] 2773; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2774; AVX-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2775; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm8[2,0],ymm9[3,0],ymm8[6,4],ymm9[7,4] 2776; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm9[0,0],ymm2[2,0],ymm9[4,4],ymm2[6,4] 2777; AVX-NEXT: vmovaps 400(%rdi), %xmm8 2778; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 2779; AVX-NEXT: vblendps {{.*#+}} ymm15 = ymm2[0,1],ymm12[2],ymm2[3,4],ymm12[5],ymm2[6,7] 2780; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm15[1,2],ymm8[0,3],ymm15[5,6],ymm8[4,7] 2781; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,2,3,1,4,6,7,5] 2782; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm15[0,1,2,3,4],ymm3[5,6,7] 2783; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2784; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm13[2,0],ymm11[3,0],ymm13[6,4],ymm11[7,4] 2785; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm11[0,0],ymm3[2,0],ymm11[4,4],ymm3[6,4] 2786; AVX-NEXT: vmovaps 592(%rdi), %xmm9 2787; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 2788; AVX-NEXT: vblendps {{.*#+}} ymm14 = ymm3[0,1],ymm10[2],ymm3[3,4],ymm10[5],ymm3[6,7] 2789; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm14[1,2],ymm9[0,3],ymm14[5,6],ymm9[4,7] 2790; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,2,3,1,4,6,7,5] 2791; AVX-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm15[5,6,7] 2792; AVX-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2793; AVX-NEXT: vmovups (%rsp), %ymm15 # 32-byte Reload 2794; AVX-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm14 # 32-byte Folded Reload 2795; AVX-NEXT: # ymm14 = ymm15[0,1],mem[2],ymm15[3,4],mem[5],ymm15[6,7] 2796; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload 2797; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm13[1,0],ymm14[2,0],ymm13[5,4],ymm14[6,4] 2798; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0],ymm15[0,3],ymm14[6,4],ymm15[4,7] 2799; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload 2800; AVX-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload 2801; AVX-NEXT: # ymm15 = ymm15[0,1],mem[0,3],ymm15[4,5],mem[4,7] 2802; AVX-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm15[5,6,7] 2803; AVX-NEXT: vblendps {{.*#+}} ymm14 = ymm7[0,1],ymm0[2],ymm7[3,4],ymm0[5],ymm7[6,7] 2804; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm4[1,0],ymm14[2,0],ymm4[5,4],ymm14[6,4] 2805; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm7[0,3],ymm0[6,4],ymm7[4,7] 2806; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 2807; AVX-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm7 # 32-byte Folded Reload 2808; AVX-NEXT: # ymm7 = ymm4[0,1],mem[0,3],ymm4[4,5],mem[4,7] 2809; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm0[0,1,2,3,4],ymm7[5,6,7] 2810; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload 2811; AVX-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm0 # 32-byte Folded Reload 2812; AVX-NEXT: # ymm0 = ymm14[0,1],mem[2],ymm14[3,4],mem[5],ymm14[6,7] 2813; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 2814; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm4[1,0],ymm0[2,0],ymm4[5,4],ymm0[6,4] 2815; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm14[0,3],ymm0[6,4],ymm14[4,7] 2816; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 2817; AVX-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm14 # 32-byte Folded Reload 2818; AVX-NEXT: # ymm14 = ymm4[0,1],mem[0,3],ymm4[4,5],mem[4,7] 2819; AVX-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3,4],ymm14[5,6,7] 2820; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm1[2],ymm6[3,4],ymm1[5],ymm6[6,7] 2821; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm5[1,0],ymm0[2,0],ymm5[5,4],ymm0[6,4] 2822; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0],ymm6[0,3],ymm0[6,4],ymm6[4,7] 2823; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 2824; AVX-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload 2825; AVX-NEXT: # ymm1 = ymm1[0,1],mem[0,3],ymm1[4,5],mem[4,7] 2826; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] 2827; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 2828; AVX-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm1 # 32-byte Folded Reload 2829; AVX-NEXT: # ymm1 = ymm6[0,1],mem[2],ymm6[3,4],mem[5],ymm6[6,7] 2830; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 2831; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm4[1,0],ymm1[2,0],ymm4[5,4],ymm1[6,4] 2832; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0],ymm6[0,3],ymm1[6,4],ymm6[4,7] 2833; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 2834; AVX-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm6 # 32-byte Folded Reload 2835; AVX-NEXT: # ymm6 = ymm4[0,1],mem[0,3],ymm4[4,5],mem[4,7] 2836; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm6[5,6,7] 2837; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm12[0,1],ymm2[2],ymm12[3,4],ymm2[5],ymm12[6,7] 2838; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm8[1,0],ymm6[2,0],ymm8[5,4],ymm6[6,4] 2839; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0],ymm12[0,3],ymm2[6,4],ymm12[4,7] 2840; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 2841; AVX-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm5 # 32-byte Folded Reload 2842; AVX-NEXT: # ymm5 = ymm4[0,1],mem[0,3],ymm4[4,5],mem[4,7] 2843; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm5[5,6,7] 2844; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 2845; AVX-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm5 # 32-byte Folded Reload 2846; AVX-NEXT: # ymm5 = ymm6[0,1],mem[2],ymm6[3,4],mem[5],ymm6[6,7] 2847; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 2848; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm4[1,0],ymm5[2,0],ymm4[5,4],ymm5[6,4] 2849; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,0],ymm6[0,3],ymm5[6,4],ymm6[4,7] 2850; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 2851; AVX-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm6 # 32-byte Folded Reload 2852; AVX-NEXT: # ymm6 = ymm4[0,1],mem[0,3],ymm4[4,5],mem[4,7] 2853; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5,6,7] 2854; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm10[0,1],ymm3[2],ymm10[3,4],ymm3[5],ymm10[6,7] 2855; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm9[1,0],ymm6[2,0],ymm9[5,4],ymm6[6,4] 2856; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0],ymm10[0,3],ymm3[6,4],ymm10[4,7] 2857; AVX-NEXT: vshufps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm4 # 32-byte Folded Reload 2858; AVX-NEXT: # ymm4 = ymm11[0,1],mem[0,3],ymm11[4,5],mem[4,7] 2859; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7] 2860; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 2861; AVX-NEXT: vmovaps %ymm4, 192(%rsi) 2862; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 2863; AVX-NEXT: vmovaps %ymm4, 128(%rsi) 2864; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 2865; AVX-NEXT: vmovaps %ymm4, 64(%rsi) 2866; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 2867; AVX-NEXT: vmovaps %ymm4, (%rsi) 2868; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 2869; AVX-NEXT: vmovaps %ymm4, 224(%rsi) 2870; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 2871; AVX-NEXT: vmovaps %ymm4, 160(%rsi) 2872; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 2873; AVX-NEXT: vmovaps %ymm4, 96(%rsi) 2874; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 2875; AVX-NEXT: vmovaps %ymm4, 32(%rsi) 2876; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 2877; AVX-NEXT: vmovaps %ymm4, 192(%rdx) 2878; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 2879; AVX-NEXT: vmovaps %ymm4, 128(%rdx) 2880; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 2881; AVX-NEXT: vmovaps %ymm4, 64(%rdx) 2882; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 2883; AVX-NEXT: vmovaps %ymm4, (%rdx) 2884; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 2885; AVX-NEXT: vmovaps %ymm4, 224(%rdx) 2886; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 2887; AVX-NEXT: vmovaps %ymm4, 160(%rdx) 2888; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 2889; AVX-NEXT: vmovaps %ymm4, 96(%rdx) 2890; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 2891; AVX-NEXT: vmovaps %ymm4, 32(%rdx) 2892; AVX-NEXT: vmovaps %ymm3, 192(%rcx) 2893; AVX-NEXT: vmovaps %ymm5, 224(%rcx) 2894; AVX-NEXT: vmovaps %ymm2, 128(%rcx) 2895; AVX-NEXT: vmovaps %ymm1, 160(%rcx) 2896; AVX-NEXT: vmovaps %ymm0, 64(%rcx) 2897; AVX-NEXT: vmovaps %ymm14, 96(%rcx) 2898; AVX-NEXT: vmovaps %ymm7, (%rcx) 2899; AVX-NEXT: vmovaps %ymm13, 32(%rcx) 2900; AVX-NEXT: addq $1384, %rsp # imm = 0x568 2901; AVX-NEXT: vzeroupper 2902; AVX-NEXT: retq 2903; 2904; AVX2-LABEL: load_i32_stride3_vf64: 2905; AVX2: # %bb.0: 2906; AVX2-NEXT: subq $1032, %rsp # imm = 0x408 2907; AVX2-NEXT: vmovaps 736(%rdi), %ymm2 2908; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2909; AVX2-NEXT: vmovaps 704(%rdi), %ymm3 2910; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2911; AVX2-NEXT: vmovaps 672(%rdi), %ymm4 2912; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2913; AVX2-NEXT: vmovaps 544(%rdi), %ymm5 2914; AVX2-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2915; AVX2-NEXT: vmovaps 512(%rdi), %ymm6 2916; AVX2-NEXT: vmovups %ymm6, (%rsp) # 32-byte Spill 2917; AVX2-NEXT: vmovaps 480(%rdi), %ymm7 2918; AVX2-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2919; AVX2-NEXT: vmovaps 352(%rdi), %ymm8 2920; AVX2-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2921; AVX2-NEXT: vmovaps 320(%rdi), %ymm10 2922; AVX2-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2923; AVX2-NEXT: vmovaps 288(%rdi), %ymm11 2924; AVX2-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2925; AVX2-NEXT: vmovaps 160(%rdi), %ymm9 2926; AVX2-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2927; AVX2-NEXT: vmovaps 128(%rdi), %ymm0 2928; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2929; AVX2-NEXT: vmovaps 96(%rdi), %ymm1 2930; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2931; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] 2932; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1],ymm9[2],ymm0[3,4],ymm9[5],ymm0[6,7] 2933; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [0,3,6,1,4,7,2,5] 2934; AVX2-NEXT: vpermps %ymm1, %ymm0, %ymm1 2935; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2936; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5,6],ymm10[7] 2937; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm8[2],ymm1[3,4],ymm8[5],ymm1[6,7] 2938; AVX2-NEXT: vpermps %ymm1, %ymm0, %ymm1 2939; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2940; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6],ymm6[7] 2941; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm5[2],ymm1[3,4],ymm5[5],ymm1[6,7] 2942; AVX2-NEXT: vpermps %ymm1, %ymm0, %ymm1 2943; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2944; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7] 2945; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] 2946; AVX2-NEXT: vpermps %ymm1, %ymm0, %ymm1 2947; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2948; AVX2-NEXT: vmovaps (%rdi), %ymm1 2949; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2950; AVX2-NEXT: vmovaps 32(%rdi), %ymm3 2951; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2952; AVX2-NEXT: vmovaps 64(%rdi), %ymm8 2953; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5,6],ymm3[7] 2954; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm8[2],ymm4[3,4],ymm8[5],ymm4[6,7] 2955; AVX2-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2956; AVX2-NEXT: vpermps %ymm4, %ymm0, %ymm4 2957; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2958; AVX2-NEXT: vmovaps 256(%rdi), %ymm1 2959; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2960; AVX2-NEXT: vmovaps 224(%rdi), %ymm2 2961; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2962; AVX2-NEXT: vmovaps 192(%rdi), %ymm3 2963; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2964; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7] 2965; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm1[2],ymm6[3,4],ymm1[5],ymm6[6,7] 2966; AVX2-NEXT: vpermps %ymm6, %ymm0, %ymm6 2967; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2968; AVX2-NEXT: vmovaps 448(%rdi), %ymm13 2969; AVX2-NEXT: vmovaps 416(%rdi), %ymm12 2970; AVX2-NEXT: vmovaps 384(%rdi), %ymm14 2971; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm14[0],ymm12[1],ymm14[2,3],ymm12[4],ymm14[5,6],ymm12[7] 2972; AVX2-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2973; AVX2-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2974; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm13[2],ymm10[3,4],ymm13[5],ymm10[6,7] 2975; AVX2-NEXT: vpermps %ymm10, %ymm0, %ymm10 2976; AVX2-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2977; AVX2-NEXT: vmovaps 640(%rdi), %ymm6 2978; AVX2-NEXT: vmovaps 608(%rdi), %ymm5 2979; AVX2-NEXT: vmovaps 576(%rdi), %ymm7 2980; AVX2-NEXT: vblendps {{.*#+}} ymm15 = ymm7[0],ymm5[1],ymm7[2,3],ymm5[4],ymm7[5,6],ymm5[7] 2981; AVX2-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2982; AVX2-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2983; AVX2-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1],ymm6[2],ymm15[3,4],ymm6[5],ymm15[6,7] 2984; AVX2-NEXT: vpermps %ymm15, %ymm0, %ymm0 2985; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2986; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload 2987; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 2988; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1],ymm4[2],ymm9[3,4],ymm4[5],ymm9[6,7] 2989; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload 2990; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0],ymm0[1,2],ymm11[3],ymm0[4,5],ymm11[6],ymm0[7] 2991; AVX2-NEXT: vmovaps {{.*#+}} ymm15 = [1,4,7,2,5,0,3,6] 2992; AVX2-NEXT: vpermps %ymm0, %ymm15, %ymm0 2993; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 2994; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 2995; AVX2-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 2996; AVX2-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] 2997; AVX2-NEXT: vblendps $73, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 2998; AVX2-NEXT: # ymm0 = mem[0],ymm0[1,2],mem[3],ymm0[4,5],mem[6],ymm0[7] 2999; AVX2-NEXT: vpermps %ymm0, %ymm15, %ymm0 3000; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3001; AVX2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 3002; AVX2-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 3003; AVX2-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] 3004; AVX2-NEXT: vblendps $73, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 3005; AVX2-NEXT: # ymm0 = mem[0],ymm0[1,2],mem[3],ymm0[4,5],mem[6],ymm0[7] 3006; AVX2-NEXT: vpermps %ymm0, %ymm15, %ymm0 3007; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3008; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3009; AVX2-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 3010; AVX2-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] 3011; AVX2-NEXT: vblendps $73, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 3012; AVX2-NEXT: # ymm0 = mem[0],ymm0[1,2],mem[3],ymm0[4,5],mem[6],ymm0[7] 3013; AVX2-NEXT: vpermps %ymm0, %ymm15, %ymm0 3014; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3015; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 3016; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 3017; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] 3018; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0],ymm0[1,2],ymm8[3],ymm0[4,5],ymm8[6],ymm0[7] 3019; AVX2-NEXT: vpermps %ymm0, %ymm15, %ymm0 3020; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3021; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload 3022; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 3023; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1],ymm3[2],ymm8[3,4],ymm3[5],ymm8[6,7] 3024; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload 3025; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0],ymm0[1,2],ymm10[3],ymm0[4,5],ymm10[6],ymm0[7] 3026; AVX2-NEXT: vpermps %ymm0, %ymm15, %ymm0 3027; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3028; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm12[2],ymm14[3,4],ymm12[5],ymm14[6,7] 3029; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0],ymm0[1,2],ymm13[3],ymm0[4,5],ymm13[6],ymm0[7] 3030; AVX2-NEXT: vmovaps %ymm13, %ymm14 3031; AVX2-NEXT: vpermps %ymm0, %ymm15, %ymm13 3032; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1],ymm5[2],ymm7[3,4],ymm5[5],ymm7[6,7] 3033; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0],ymm0[1,2],ymm6[3],ymm0[4,5],ymm6[6],ymm0[7] 3034; AVX2-NEXT: vmovaps %ymm6, %ymm7 3035; AVX2-NEXT: vpermps %ymm0, %ymm15, %ymm15 3036; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm9[2],ymm4[3,4],ymm9[5],ymm4[6,7] 3037; AVX2-NEXT: vmovaps {{.*#+}} ymm12 = [2,5,0,3,6,u,u,u] 3038; AVX2-NEXT: vpermps %ymm0, %ymm12, %ymm0 3039; AVX2-NEXT: vshufps {{.*#+}} ymm9 = ymm11[0,1,0,3,4,5,4,7] 3040; AVX2-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,0,3] 3041; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3,4],ymm9[5,6,7] 3042; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] 3043; AVX2-NEXT: vpermps %ymm0, %ymm12, %ymm0 3044; AVX2-NEXT: vpermilps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload 3045; AVX2-NEXT: # ymm1 = mem[0,1,0,3,4,5,4,7] 3046; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] 3047; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] 3048; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 3049; AVX2-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload 3050; AVX2-NEXT: # ymm1 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7] 3051; AVX2-NEXT: vpermps %ymm1, %ymm12, %ymm1 3052; AVX2-NEXT: vpermilps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload 3053; AVX2-NEXT: # ymm2 = mem[0,1,0,3,4,5,4,7] 3054; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] 3055; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] 3056; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm8[2],ymm3[3,4],ymm8[5],ymm3[6,7] 3057; AVX2-NEXT: vpermps %ymm2, %ymm12, %ymm2 3058; AVX2-NEXT: vshufps {{.*#+}} ymm3 = ymm10[0,1,0,3,4,5,4,7] 3059; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] 3060; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] 3061; AVX2-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload 3062; AVX2-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload 3063; AVX2-NEXT: # ymm3 = ymm3[0,1],mem[2],ymm3[3,4],mem[5],ymm3[6,7] 3064; AVX2-NEXT: vpermps %ymm3, %ymm12, %ymm3 3065; AVX2-NEXT: vpermilps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload 3066; AVX2-NEXT: # ymm4 = mem[0,1,0,3,4,5,4,7] 3067; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] 3068; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7] 3069; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 3070; AVX2-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload 3071; AVX2-NEXT: # ymm4 = mem[0,1],ymm4[2],mem[3,4],ymm4[5],mem[6,7] 3072; AVX2-NEXT: vpermps %ymm4, %ymm12, %ymm4 3073; AVX2-NEXT: vshufps {{.*#+}} ymm5 = ymm14[0,1,0,3,4,5,4,7] 3074; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] 3075; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7] 3076; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 3077; AVX2-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload 3078; AVX2-NEXT: # ymm5 = ymm5[0,1],mem[2],ymm5[3,4],mem[5],ymm5[6,7] 3079; AVX2-NEXT: vpermps %ymm5, %ymm12, %ymm5 3080; AVX2-NEXT: vpermilps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload 3081; AVX2-NEXT: # ymm6 = mem[0,1,0,3,4,5,4,7] 3082; AVX2-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,0,3] 3083; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5,6,7] 3084; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 3085; AVX2-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload 3086; AVX2-NEXT: # ymm6 = ymm6[0,1],mem[2],ymm6[3,4],mem[5],ymm6[6,7] 3087; AVX2-NEXT: vpermps %ymm6, %ymm12, %ymm6 3088; AVX2-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,1,0,3,4,5,4,7] 3089; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] 3090; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6,7] 3091; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 3092; AVX2-NEXT: vmovaps %ymm7, 192(%rsi) 3093; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 3094; AVX2-NEXT: vmovaps %ymm7, 128(%rsi) 3095; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 3096; AVX2-NEXT: vmovaps %ymm7, 64(%rsi) 3097; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 3098; AVX2-NEXT: vmovaps %ymm7, (%rsi) 3099; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 3100; AVX2-NEXT: vmovaps %ymm7, 224(%rsi) 3101; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 3102; AVX2-NEXT: vmovaps %ymm7, 160(%rsi) 3103; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 3104; AVX2-NEXT: vmovaps %ymm7, 96(%rsi) 3105; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 3106; AVX2-NEXT: vmovaps %ymm7, 32(%rsi) 3107; AVX2-NEXT: vmovaps %ymm15, 192(%rdx) 3108; AVX2-NEXT: vmovaps %ymm13, 128(%rdx) 3109; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 3110; AVX2-NEXT: vmovaps %ymm7, 64(%rdx) 3111; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 3112; AVX2-NEXT: vmovaps %ymm7, (%rdx) 3113; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 3114; AVX2-NEXT: vmovaps %ymm7, 224(%rdx) 3115; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 3116; AVX2-NEXT: vmovaps %ymm7, 160(%rdx) 3117; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 3118; AVX2-NEXT: vmovaps %ymm7, 96(%rdx) 3119; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 3120; AVX2-NEXT: vmovaps %ymm7, 32(%rdx) 3121; AVX2-NEXT: vmovaps %ymm6, 192(%rcx) 3122; AVX2-NEXT: vmovaps %ymm5, 224(%rcx) 3123; AVX2-NEXT: vmovaps %ymm4, 128(%rcx) 3124; AVX2-NEXT: vmovaps %ymm3, 160(%rcx) 3125; AVX2-NEXT: vmovaps %ymm2, 64(%rcx) 3126; AVX2-NEXT: vmovaps %ymm1, 96(%rcx) 3127; AVX2-NEXT: vmovaps %ymm0, (%rcx) 3128; AVX2-NEXT: vmovaps %ymm9, 32(%rcx) 3129; AVX2-NEXT: addq $1032, %rsp # imm = 0x408 3130; AVX2-NEXT: vzeroupper 3131; AVX2-NEXT: retq 3132; 3133; AVX2-FP-LABEL: load_i32_stride3_vf64: 3134; AVX2-FP: # %bb.0: 3135; AVX2-FP-NEXT: subq $1032, %rsp # imm = 0x408 3136; AVX2-FP-NEXT: vmovaps 736(%rdi), %ymm2 3137; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3138; AVX2-FP-NEXT: vmovaps 704(%rdi), %ymm3 3139; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3140; AVX2-FP-NEXT: vmovaps 672(%rdi), %ymm4 3141; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3142; AVX2-FP-NEXT: vmovaps 544(%rdi), %ymm5 3143; AVX2-FP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3144; AVX2-FP-NEXT: vmovaps 512(%rdi), %ymm6 3145; AVX2-FP-NEXT: vmovups %ymm6, (%rsp) # 32-byte Spill 3146; AVX2-FP-NEXT: vmovaps 480(%rdi), %ymm7 3147; AVX2-FP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3148; AVX2-FP-NEXT: vmovaps 352(%rdi), %ymm8 3149; AVX2-FP-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3150; AVX2-FP-NEXT: vmovaps 320(%rdi), %ymm10 3151; AVX2-FP-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3152; AVX2-FP-NEXT: vmovaps 288(%rdi), %ymm11 3153; AVX2-FP-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3154; AVX2-FP-NEXT: vmovaps 160(%rdi), %ymm9 3155; AVX2-FP-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3156; AVX2-FP-NEXT: vmovaps 128(%rdi), %ymm0 3157; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3158; AVX2-FP-NEXT: vmovaps 96(%rdi), %ymm1 3159; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3160; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5,6],ymm0[7] 3161; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1],ymm9[2],ymm0[3,4],ymm9[5],ymm0[6,7] 3162; AVX2-FP-NEXT: vmovaps {{.*#+}} ymm0 = [0,3,6,1,4,7,2,5] 3163; AVX2-FP-NEXT: vpermps %ymm1, %ymm0, %ymm1 3164; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3165; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5,6],ymm10[7] 3166; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm8[2],ymm1[3,4],ymm8[5],ymm1[6,7] 3167; AVX2-FP-NEXT: vpermps %ymm1, %ymm0, %ymm1 3168; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3169; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6],ymm6[7] 3170; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm5[2],ymm1[3,4],ymm5[5],ymm1[6,7] 3171; AVX2-FP-NEXT: vpermps %ymm1, %ymm0, %ymm1 3172; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3173; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7] 3174; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] 3175; AVX2-FP-NEXT: vpermps %ymm1, %ymm0, %ymm1 3176; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3177; AVX2-FP-NEXT: vmovaps (%rdi), %ymm1 3178; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3179; AVX2-FP-NEXT: vmovaps 32(%rdi), %ymm3 3180; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3181; AVX2-FP-NEXT: vmovaps 64(%rdi), %ymm8 3182; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5,6],ymm3[7] 3183; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1],ymm8[2],ymm4[3,4],ymm8[5],ymm4[6,7] 3184; AVX2-FP-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3185; AVX2-FP-NEXT: vpermps %ymm4, %ymm0, %ymm4 3186; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3187; AVX2-FP-NEXT: vmovaps 256(%rdi), %ymm1 3188; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3189; AVX2-FP-NEXT: vmovaps 224(%rdi), %ymm2 3190; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3191; AVX2-FP-NEXT: vmovaps 192(%rdi), %ymm3 3192; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3193; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7] 3194; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1],ymm1[2],ymm6[3,4],ymm1[5],ymm6[6,7] 3195; AVX2-FP-NEXT: vpermps %ymm6, %ymm0, %ymm6 3196; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3197; AVX2-FP-NEXT: vmovaps 448(%rdi), %ymm13 3198; AVX2-FP-NEXT: vmovaps 416(%rdi), %ymm12 3199; AVX2-FP-NEXT: vmovaps 384(%rdi), %ymm14 3200; AVX2-FP-NEXT: vblendps {{.*#+}} ymm10 = ymm14[0],ymm12[1],ymm14[2,3],ymm12[4],ymm14[5,6],ymm12[7] 3201; AVX2-FP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3202; AVX2-FP-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3203; AVX2-FP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1],ymm13[2],ymm10[3,4],ymm13[5],ymm10[6,7] 3204; AVX2-FP-NEXT: vpermps %ymm10, %ymm0, %ymm10 3205; AVX2-FP-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3206; AVX2-FP-NEXT: vmovaps 640(%rdi), %ymm6 3207; AVX2-FP-NEXT: vmovaps 608(%rdi), %ymm5 3208; AVX2-FP-NEXT: vmovaps 576(%rdi), %ymm7 3209; AVX2-FP-NEXT: vblendps {{.*#+}} ymm15 = ymm7[0],ymm5[1],ymm7[2,3],ymm5[4],ymm7[5,6],ymm5[7] 3210; AVX2-FP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3211; AVX2-FP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3212; AVX2-FP-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1],ymm6[2],ymm15[3,4],ymm6[5],ymm15[6,7] 3213; AVX2-FP-NEXT: vpermps %ymm15, %ymm0, %ymm0 3214; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3215; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload 3216; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 3217; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1],ymm4[2],ymm9[3,4],ymm4[5],ymm9[6,7] 3218; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload 3219; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0],ymm0[1,2],ymm11[3],ymm0[4,5],ymm11[6],ymm0[7] 3220; AVX2-FP-NEXT: vmovaps {{.*#+}} ymm15 = [1,4,7,2,5,0,3,6] 3221; AVX2-FP-NEXT: vpermps %ymm0, %ymm15, %ymm0 3222; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3223; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3224; AVX2-FP-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 3225; AVX2-FP-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] 3226; AVX2-FP-NEXT: vblendps $73, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 3227; AVX2-FP-NEXT: # ymm0 = mem[0],ymm0[1,2],mem[3],ymm0[4,5],mem[6],ymm0[7] 3228; AVX2-FP-NEXT: vpermps %ymm0, %ymm15, %ymm0 3229; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3230; AVX2-FP-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload 3231; AVX2-FP-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 3232; AVX2-FP-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] 3233; AVX2-FP-NEXT: vblendps $73, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 3234; AVX2-FP-NEXT: # ymm0 = mem[0],ymm0[1,2],mem[3],ymm0[4,5],mem[6],ymm0[7] 3235; AVX2-FP-NEXT: vpermps %ymm0, %ymm15, %ymm0 3236; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3237; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3238; AVX2-FP-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 3239; AVX2-FP-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] 3240; AVX2-FP-NEXT: vblendps $73, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 3241; AVX2-FP-NEXT: # ymm0 = mem[0],ymm0[1,2],mem[3],ymm0[4,5],mem[6],ymm0[7] 3242; AVX2-FP-NEXT: vpermps %ymm0, %ymm15, %ymm0 3243; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3244; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 3245; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 3246; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] 3247; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0],ymm0[1,2],ymm8[3],ymm0[4,5],ymm8[6],ymm0[7] 3248; AVX2-FP-NEXT: vpermps %ymm0, %ymm15, %ymm0 3249; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3250; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload 3251; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 3252; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1],ymm3[2],ymm8[3,4],ymm3[5],ymm8[6,7] 3253; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload 3254; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0],ymm0[1,2],ymm10[3],ymm0[4,5],ymm10[6],ymm0[7] 3255; AVX2-FP-NEXT: vpermps %ymm0, %ymm15, %ymm0 3256; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3257; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm12[2],ymm14[3,4],ymm12[5],ymm14[6,7] 3258; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0],ymm0[1,2],ymm13[3],ymm0[4,5],ymm13[6],ymm0[7] 3259; AVX2-FP-NEXT: vmovaps %ymm13, %ymm14 3260; AVX2-FP-NEXT: vpermps %ymm0, %ymm15, %ymm13 3261; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1],ymm5[2],ymm7[3,4],ymm5[5],ymm7[6,7] 3262; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0],ymm0[1,2],ymm6[3],ymm0[4,5],ymm6[6],ymm0[7] 3263; AVX2-FP-NEXT: vmovaps %ymm6, %ymm7 3264; AVX2-FP-NEXT: vpermps %ymm0, %ymm15, %ymm15 3265; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1],ymm9[2],ymm4[3,4],ymm9[5],ymm4[6,7] 3266; AVX2-FP-NEXT: vmovaps {{.*#+}} ymm12 = [2,5,0,3,6,u,u,u] 3267; AVX2-FP-NEXT: vpermps %ymm0, %ymm12, %ymm0 3268; AVX2-FP-NEXT: vshufps {{.*#+}} ymm9 = ymm11[0,1,0,3,4,5,4,7] 3269; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,1,0,3] 3270; AVX2-FP-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3,4],ymm9[5,6,7] 3271; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] 3272; AVX2-FP-NEXT: vpermps %ymm0, %ymm12, %ymm0 3273; AVX2-FP-NEXT: vpermilps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload 3274; AVX2-FP-NEXT: # ymm1 = mem[0,1,0,3,4,5,4,7] 3275; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] 3276; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] 3277; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 3278; AVX2-FP-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload 3279; AVX2-FP-NEXT: # ymm1 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7] 3280; AVX2-FP-NEXT: vpermps %ymm1, %ymm12, %ymm1 3281; AVX2-FP-NEXT: vpermilps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload 3282; AVX2-FP-NEXT: # ymm2 = mem[0,1,0,3,4,5,4,7] 3283; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] 3284; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] 3285; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm8[2],ymm3[3,4],ymm8[5],ymm3[6,7] 3286; AVX2-FP-NEXT: vpermps %ymm2, %ymm12, %ymm2 3287; AVX2-FP-NEXT: vshufps {{.*#+}} ymm3 = ymm10[0,1,0,3,4,5,4,7] 3288; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,3] 3289; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5,6,7] 3290; AVX2-FP-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload 3291; AVX2-FP-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload 3292; AVX2-FP-NEXT: # ymm3 = ymm3[0,1],mem[2],ymm3[3,4],mem[5],ymm3[6,7] 3293; AVX2-FP-NEXT: vpermps %ymm3, %ymm12, %ymm3 3294; AVX2-FP-NEXT: vpermilps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload 3295; AVX2-FP-NEXT: # ymm4 = mem[0,1,0,3,4,5,4,7] 3296; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] 3297; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5,6,7] 3298; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 3299; AVX2-FP-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload 3300; AVX2-FP-NEXT: # ymm4 = mem[0,1],ymm4[2],mem[3,4],ymm4[5],mem[6,7] 3301; AVX2-FP-NEXT: vpermps %ymm4, %ymm12, %ymm4 3302; AVX2-FP-NEXT: vshufps {{.*#+}} ymm5 = ymm14[0,1,0,3,4,5,4,7] 3303; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,1,0,3] 3304; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7] 3305; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 3306; AVX2-FP-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload 3307; AVX2-FP-NEXT: # ymm5 = ymm5[0,1],mem[2],ymm5[3,4],mem[5],ymm5[6,7] 3308; AVX2-FP-NEXT: vpermps %ymm5, %ymm12, %ymm5 3309; AVX2-FP-NEXT: vpermilps $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload 3310; AVX2-FP-NEXT: # ymm6 = mem[0,1,0,3,4,5,4,7] 3311; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm6 = ymm6[0,1,0,3] 3312; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm6[5,6,7] 3313; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 3314; AVX2-FP-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload 3315; AVX2-FP-NEXT: # ymm6 = ymm6[0,1],mem[2],ymm6[3,4],mem[5],ymm6[6,7] 3316; AVX2-FP-NEXT: vpermps %ymm6, %ymm12, %ymm6 3317; AVX2-FP-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,1,0,3,4,5,4,7] 3318; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm7 = ymm7[0,1,0,3] 3319; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5,6,7] 3320; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 3321; AVX2-FP-NEXT: vmovaps %ymm7, 192(%rsi) 3322; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 3323; AVX2-FP-NEXT: vmovaps %ymm7, 128(%rsi) 3324; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 3325; AVX2-FP-NEXT: vmovaps %ymm7, 64(%rsi) 3326; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 3327; AVX2-FP-NEXT: vmovaps %ymm7, (%rsi) 3328; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 3329; AVX2-FP-NEXT: vmovaps %ymm7, 224(%rsi) 3330; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 3331; AVX2-FP-NEXT: vmovaps %ymm7, 160(%rsi) 3332; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 3333; AVX2-FP-NEXT: vmovaps %ymm7, 96(%rsi) 3334; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 3335; AVX2-FP-NEXT: vmovaps %ymm7, 32(%rsi) 3336; AVX2-FP-NEXT: vmovaps %ymm15, 192(%rdx) 3337; AVX2-FP-NEXT: vmovaps %ymm13, 128(%rdx) 3338; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 3339; AVX2-FP-NEXT: vmovaps %ymm7, 64(%rdx) 3340; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 3341; AVX2-FP-NEXT: vmovaps %ymm7, (%rdx) 3342; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 3343; AVX2-FP-NEXT: vmovaps %ymm7, 224(%rdx) 3344; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 3345; AVX2-FP-NEXT: vmovaps %ymm7, 160(%rdx) 3346; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 3347; AVX2-FP-NEXT: vmovaps %ymm7, 96(%rdx) 3348; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload 3349; AVX2-FP-NEXT: vmovaps %ymm7, 32(%rdx) 3350; AVX2-FP-NEXT: vmovaps %ymm6, 192(%rcx) 3351; AVX2-FP-NEXT: vmovaps %ymm5, 224(%rcx) 3352; AVX2-FP-NEXT: vmovaps %ymm4, 128(%rcx) 3353; AVX2-FP-NEXT: vmovaps %ymm3, 160(%rcx) 3354; AVX2-FP-NEXT: vmovaps %ymm2, 64(%rcx) 3355; AVX2-FP-NEXT: vmovaps %ymm1, 96(%rcx) 3356; AVX2-FP-NEXT: vmovaps %ymm0, (%rcx) 3357; AVX2-FP-NEXT: vmovaps %ymm9, 32(%rcx) 3358; AVX2-FP-NEXT: addq $1032, %rsp # imm = 0x408 3359; AVX2-FP-NEXT: vzeroupper 3360; AVX2-FP-NEXT: retq 3361; 3362; AVX2-FCP-LABEL: load_i32_stride3_vf64: 3363; AVX2-FCP: # %bb.0: 3364; AVX2-FCP-NEXT: subq $1032, %rsp # imm = 0x408 3365; AVX2-FCP-NEXT: vmovaps 736(%rdi), %ymm2 3366; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3367; AVX2-FCP-NEXT: vmovaps 704(%rdi), %ymm3 3368; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3369; AVX2-FCP-NEXT: vmovaps 672(%rdi), %ymm4 3370; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3371; AVX2-FCP-NEXT: vmovaps 544(%rdi), %ymm5 3372; AVX2-FCP-NEXT: vmovups %ymm5, (%rsp) # 32-byte Spill 3373; AVX2-FCP-NEXT: vmovaps 512(%rdi), %ymm6 3374; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3375; AVX2-FCP-NEXT: vmovaps 480(%rdi), %ymm7 3376; AVX2-FCP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3377; AVX2-FCP-NEXT: vmovaps 352(%rdi), %ymm8 3378; AVX2-FCP-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3379; AVX2-FCP-NEXT: vmovaps 320(%rdi), %ymm9 3380; AVX2-FCP-NEXT: vmovaps 288(%rdi), %ymm10 3381; AVX2-FCP-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3382; AVX2-FCP-NEXT: vmovaps 160(%rdi), %ymm11 3383; AVX2-FCP-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3384; AVX2-FCP-NEXT: vmovaps 128(%rdi), %ymm14 3385; AVX2-FCP-NEXT: vmovaps 96(%rdi), %ymm13 3386; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0],ymm14[1],ymm13[2,3],ymm14[4],ymm13[5,6],ymm14[7] 3387; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1],ymm11[2],ymm0[3,4],ymm11[5],ymm0[6,7] 3388; AVX2-FCP-NEXT: vmovaps {{.*#+}} ymm0 = [0,3,6,1,4,7,2,5] 3389; AVX2-FCP-NEXT: vpermps %ymm1, %ymm0, %ymm1 3390; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3391; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0],ymm9[1],ymm10[2,3],ymm9[4],ymm10[5,6],ymm9[7] 3392; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm8[2],ymm1[3,4],ymm8[5],ymm1[6,7] 3393; AVX2-FCP-NEXT: vpermps %ymm1, %ymm0, %ymm1 3394; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3395; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6],ymm6[7] 3396; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm5[2],ymm1[3,4],ymm5[5],ymm1[6,7] 3397; AVX2-FCP-NEXT: vpermps %ymm1, %ymm0, %ymm1 3398; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3399; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7] 3400; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] 3401; AVX2-FCP-NEXT: vpermps %ymm1, %ymm0, %ymm1 3402; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3403; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm5 3404; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm1 3405; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3406; AVX2-FCP-NEXT: vmovaps 64(%rdi), %ymm3 3407; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3408; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0],ymm1[1],ymm5[2,3],ymm1[4],ymm5[5,6],ymm1[7] 3409; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3410; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3,4],ymm3[5],ymm1[6,7] 3411; AVX2-FCP-NEXT: vpermps %ymm1, %ymm0, %ymm1 3412; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3413; AVX2-FCP-NEXT: vmovaps 256(%rdi), %ymm2 3414; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3415; AVX2-FCP-NEXT: vmovaps 224(%rdi), %ymm4 3416; AVX2-FCP-NEXT: vmovaps 192(%rdi), %ymm1 3417; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3418; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm4[1],ymm1[2,3],ymm4[4],ymm1[5,6],ymm4[7] 3419; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3420; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] 3421; AVX2-FCP-NEXT: vpermps %ymm1, %ymm0, %ymm1 3422; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3423; AVX2-FCP-NEXT: vmovaps 448(%rdi), %ymm1 3424; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3425; AVX2-FCP-NEXT: vmovaps 416(%rdi), %ymm2 3426; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3427; AVX2-FCP-NEXT: vmovaps 384(%rdi), %ymm3 3428; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3429; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6],ymm2[7] 3430; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm1[2],ymm3[3,4],ymm1[5],ymm3[6,7] 3431; AVX2-FCP-NEXT: vpermps %ymm3, %ymm0, %ymm1 3432; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3433; AVX2-FCP-NEXT: vmovaps 640(%rdi), %ymm8 3434; AVX2-FCP-NEXT: vmovaps 608(%rdi), %ymm7 3435; AVX2-FCP-NEXT: vmovaps 576(%rdi), %ymm11 3436; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm15 = ymm11[0],ymm7[1],ymm11[2,3],ymm7[4],ymm11[5,6],ymm7[7] 3437; AVX2-FCP-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3438; AVX2-FCP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3439; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1],ymm8[2],ymm15[3,4],ymm8[5],ymm15[6,7] 3440; AVX2-FCP-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3441; AVX2-FCP-NEXT: vpermps %ymm15, %ymm0, %ymm0 3442; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3443; AVX2-FCP-NEXT: vmovaps %ymm14, %ymm10 3444; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm13[0,1],ymm14[2],ymm13[3,4],ymm14[5],ymm13[6,7] 3445; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload 3446; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0],ymm0[1,2],ymm12[3],ymm0[4,5],ymm12[6],ymm0[7] 3447; AVX2-FCP-NEXT: vmovaps {{.*#+}} ymm15 = [1,4,7,2,5,0,3,6] 3448; AVX2-FCP-NEXT: vpermps %ymm0, %ymm15, %ymm0 3449; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3450; AVX2-FCP-NEXT: vmovaps %ymm9, %ymm14 3451; AVX2-FCP-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm0 # 32-byte Folded Reload 3452; AVX2-FCP-NEXT: # ymm0 = mem[0,1],ymm9[2],mem[3,4],ymm9[5],mem[6,7] 3453; AVX2-FCP-NEXT: vblendps $73, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 3454; AVX2-FCP-NEXT: # ymm0 = mem[0],ymm0[1,2],mem[3],ymm0[4,5],mem[6],ymm0[7] 3455; AVX2-FCP-NEXT: vpermps %ymm0, %ymm15, %ymm0 3456; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3457; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3458; AVX2-FCP-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 3459; AVX2-FCP-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] 3460; AVX2-FCP-NEXT: vblendps $73, (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload 3461; AVX2-FCP-NEXT: # ymm0 = mem[0],ymm0[1,2],mem[3],ymm0[4,5],mem[6],ymm0[7] 3462; AVX2-FCP-NEXT: vpermps %ymm0, %ymm15, %ymm0 3463; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3464; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload 3465; AVX2-FCP-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 3466; AVX2-FCP-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] 3467; AVX2-FCP-NEXT: vblendps $73, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload 3468; AVX2-FCP-NEXT: # ymm0 = mem[0],ymm0[1,2],mem[3],ymm0[4,5],mem[6],ymm0[7] 3469; AVX2-FCP-NEXT: vpermps %ymm0, %ymm15, %ymm0 3470; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3471; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 3472; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm1[2],ymm5[3,4],ymm1[5],ymm5[6,7] 3473; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload 3474; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0],ymm0[1,2],ymm2[3],ymm0[4,5],ymm2[6],ymm0[7] 3475; AVX2-FCP-NEXT: vpermps %ymm0, %ymm15, %ymm0 3476; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3477; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 3478; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7] 3479; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload 3480; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0],ymm0[1,2],ymm4[3],ymm0[4,5],ymm4[6],ymm0[7] 3481; AVX2-FCP-NEXT: vpermps %ymm0, %ymm15, %ymm0 3482; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3483; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload 3484; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 3485; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7] 3486; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload 3487; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0],ymm0[1,2],ymm9[3],ymm0[4,5],ymm9[6],ymm0[7] 3488; AVX2-FCP-NEXT: vpermps %ymm0, %ymm15, %ymm0 3489; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 3490; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm11[0,1],ymm7[2],ymm11[3,4],ymm7[5],ymm11[6,7] 3491; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0],ymm0[1,2],ymm8[3],ymm0[4,5],ymm8[6],ymm0[7] 3492; AVX2-FCP-NEXT: vpermps %ymm0, %ymm15, %ymm11 3493; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1],ymm13[2],ymm10[3,4],ymm13[5],ymm10[6,7] 3494; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm12[1],ymm0[2,3],ymm12[4],ymm0[5,6],ymm12[7] 3495; AVX2-FCP-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload 3496; AVX2-FCP-NEXT: # ymm8 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7] 3497; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm15 = ymm8[0],ymm2[1],ymm8[2,3],ymm2[4],ymm8[5,6],ymm2[7] 3498; AVX2-FCP-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm8 # 32-byte Folded Reload 3499; AVX2-FCP-NEXT: # ymm8 = ymm14[0,1],mem[2],ymm14[3,4],mem[5],ymm14[6,7] 3500; AVX2-FCP-NEXT: vblendps $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload 3501; AVX2-FCP-NEXT: # ymm8 = ymm8[0],mem[1],ymm8[2,3],mem[4],ymm8[5,6],mem[7] 3502; AVX2-FCP-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm2 # 32-byte Folded Reload 3503; AVX2-FCP-NEXT: # ymm2 = mem[0,1],ymm3[2],mem[3,4],ymm3[5],mem[6,7] 3504; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[2,3],ymm4[4],ymm2[5,6],ymm4[7] 3505; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload 3506; AVX2-FCP-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload 3507; AVX2-FCP-NEXT: # ymm7 = ymm1[0,1],mem[2],ymm1[3,4],mem[5],ymm1[6,7] 3508; AVX2-FCP-NEXT: vblendps $146, (%rsp), %ymm7, %ymm7 # 32-byte Folded Reload 3509; AVX2-FCP-NEXT: # ymm7 = ymm7[0],mem[1],ymm7[2,3],mem[4],ymm7[5,6],mem[7] 3510; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7] 3511; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0],ymm9[1],ymm1[2,3],ymm9[4],ymm1[5,6],ymm9[7] 3512; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 3513; AVX2-FCP-NEXT: vblendps $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm6 # 32-byte Folded Reload 3514; AVX2-FCP-NEXT: # ymm6 = ymm3[0,1],mem[2],ymm3[3,4],mem[5],ymm3[6,7] 3515; AVX2-FCP-NEXT: vblendps $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm6 # 32-byte Folded Reload 3516; AVX2-FCP-NEXT: # ymm6 = ymm6[0],mem[1],ymm6[2,3],mem[4],ymm6[5,6],mem[7] 3517; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload 3518; AVX2-FCP-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload 3519; AVX2-FCP-NEXT: # ymm3 = mem[0,1],ymm3[2],mem[3,4],ymm3[5],mem[6,7] 3520; AVX2-FCP-NEXT: vblendps $146, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload 3521; AVX2-FCP-NEXT: # ymm3 = ymm3[0],mem[1],ymm3[2,3],mem[4],ymm3[5,6],mem[7] 3522; AVX2-FCP-NEXT: vmovaps {{.*#+}} ymm5 = [2,5,0,3,6,1,4,7] 3523; AVX2-FCP-NEXT: vpermps %ymm0, %ymm5, %ymm0 3524; AVX2-FCP-NEXT: vpermps %ymm15, %ymm5, %ymm4 3525; AVX2-FCP-NEXT: vpermps %ymm8, %ymm5, %ymm8 3526; AVX2-FCP-NEXT: vpermps %ymm2, %ymm5, %ymm2 3527; AVX2-FCP-NEXT: vpermps %ymm7, %ymm5, %ymm7 3528; AVX2-FCP-NEXT: vpermps %ymm1, %ymm5, %ymm1 3529; AVX2-FCP-NEXT: vpermps %ymm6, %ymm5, %ymm6 3530; AVX2-FCP-NEXT: vpermps %ymm3, %ymm5, %ymm3 3531; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 3532; AVX2-FCP-NEXT: vmovaps %ymm5, 192(%rsi) 3533; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 3534; AVX2-FCP-NEXT: vmovaps %ymm5, 128(%rsi) 3535; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 3536; AVX2-FCP-NEXT: vmovaps %ymm5, 64(%rsi) 3537; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 3538; AVX2-FCP-NEXT: vmovaps %ymm5, (%rsi) 3539; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 3540; AVX2-FCP-NEXT: vmovaps %ymm5, 224(%rsi) 3541; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 3542; AVX2-FCP-NEXT: vmovaps %ymm5, 160(%rsi) 3543; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 3544; AVX2-FCP-NEXT: vmovaps %ymm5, 96(%rsi) 3545; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 3546; AVX2-FCP-NEXT: vmovaps %ymm5, 32(%rsi) 3547; AVX2-FCP-NEXT: vmovaps %ymm11, 192(%rdx) 3548; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 3549; AVX2-FCP-NEXT: vmovaps %ymm5, 128(%rdx) 3550; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 3551; AVX2-FCP-NEXT: vmovaps %ymm5, 64(%rdx) 3552; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 3553; AVX2-FCP-NEXT: vmovaps %ymm5, (%rdx) 3554; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 3555; AVX2-FCP-NEXT: vmovaps %ymm5, 224(%rdx) 3556; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 3557; AVX2-FCP-NEXT: vmovaps %ymm5, 160(%rdx) 3558; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 3559; AVX2-FCP-NEXT: vmovaps %ymm5, 96(%rdx) 3560; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload 3561; AVX2-FCP-NEXT: vmovaps %ymm5, 32(%rdx) 3562; AVX2-FCP-NEXT: vmovaps %ymm3, 192(%rcx) 3563; AVX2-FCP-NEXT: vmovaps %ymm6, 224(%rcx) 3564; AVX2-FCP-NEXT: vmovaps %ymm1, 128(%rcx) 3565; AVX2-FCP-NEXT: vmovaps %ymm7, 160(%rcx) 3566; AVX2-FCP-NEXT: vmovaps %ymm2, 64(%rcx) 3567; AVX2-FCP-NEXT: vmovaps %ymm8, 96(%rcx) 3568; AVX2-FCP-NEXT: vmovaps %ymm4, (%rcx) 3569; AVX2-FCP-NEXT: vmovaps %ymm0, 32(%rcx) 3570; AVX2-FCP-NEXT: addq $1032, %rsp # imm = 0x408 3571; AVX2-FCP-NEXT: vzeroupper 3572; AVX2-FCP-NEXT: retq 3573; 3574; AVX512-LABEL: load_i32_stride3_vf64: 3575; AVX512: # %bb.0: 3576; AVX512-NEXT: vmovdqa64 704(%rdi), %zmm4 3577; AVX512-NEXT: vmovdqa64 640(%rdi), %zmm5 3578; AVX512-NEXT: vmovdqa64 576(%rdi), %zmm0 3579; AVX512-NEXT: vmovdqa64 512(%rdi), %zmm6 3580; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm7 3581; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm1 3582; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm8 3583; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm9 3584; AVX512-NEXT: vmovdqa64 (%rdi), %zmm2 3585; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm10 3586; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm11 3587; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm3 3588; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0] 3589; AVX512-NEXT: vmovdqa64 %zmm3, %zmm13 3590; AVX512-NEXT: vpermt2d %zmm9, %zmm12, %zmm13 3591; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] 3592; AVX512-NEXT: vpermt2d %zmm8, %zmm14, %zmm13 3593; AVX512-NEXT: vmovdqa64 %zmm1, %zmm15 3594; AVX512-NEXT: vpermt2d %zmm7, %zmm12, %zmm15 3595; AVX512-NEXT: vpermt2d %zmm6, %zmm14, %zmm15 3596; AVX512-NEXT: vmovdqa64 %zmm0, %zmm16 3597; AVX512-NEXT: vpermt2d %zmm5, %zmm12, %zmm16 3598; AVX512-NEXT: vpermt2d %zmm4, %zmm14, %zmm16 3599; AVX512-NEXT: vpermi2d %zmm10, %zmm2, %zmm12 3600; AVX512-NEXT: vpermt2d %zmm11, %zmm14, %zmm12 3601; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm14 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0] 3602; AVX512-NEXT: vmovdqa64 %zmm7, %zmm17 3603; AVX512-NEXT: vpermt2d %zmm1, %zmm14, %zmm17 3604; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] 3605; AVX512-NEXT: vpermt2d %zmm6, %zmm18, %zmm17 3606; AVX512-NEXT: vmovdqa64 %zmm9, %zmm19 3607; AVX512-NEXT: vpermt2d %zmm3, %zmm14, %zmm19 3608; AVX512-NEXT: vpermt2d %zmm8, %zmm18, %zmm19 3609; AVX512-NEXT: vmovdqa64 %zmm5, %zmm20 3610; AVX512-NEXT: vpermt2d %zmm0, %zmm14, %zmm20 3611; AVX512-NEXT: vpermt2d %zmm4, %zmm18, %zmm20 3612; AVX512-NEXT: vpermi2d %zmm2, %zmm10, %zmm14 3613; AVX512-NEXT: vpermt2d %zmm11, %zmm18, %zmm14 3614; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm18 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0] 3615; AVX512-NEXT: vpermt2d %zmm9, %zmm18, %zmm3 3616; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] 3617; AVX512-NEXT: vpermt2d %zmm8, %zmm9, %zmm3 3618; AVX512-NEXT: vpermt2d %zmm5, %zmm18, %zmm0 3619; AVX512-NEXT: vpermt2d %zmm4, %zmm9, %zmm0 3620; AVX512-NEXT: vpermt2d %zmm7, %zmm18, %zmm1 3621; AVX512-NEXT: vpermt2d %zmm6, %zmm9, %zmm1 3622; AVX512-NEXT: vpermt2d %zmm10, %zmm18, %zmm2 3623; AVX512-NEXT: vpermt2d %zmm11, %zmm9, %zmm2 3624; AVX512-NEXT: vmovdqa64 %zmm16, 192(%rsi) 3625; AVX512-NEXT: vmovdqa64 %zmm15, 128(%rsi) 3626; AVX512-NEXT: vmovdqa64 %zmm13, 64(%rsi) 3627; AVX512-NEXT: vmovdqa64 %zmm12, (%rsi) 3628; AVX512-NEXT: vmovdqa64 %zmm20, 192(%rdx) 3629; AVX512-NEXT: vmovdqa64 %zmm14, (%rdx) 3630; AVX512-NEXT: vmovdqa64 %zmm19, 64(%rdx) 3631; AVX512-NEXT: vmovdqa64 %zmm17, 128(%rdx) 3632; AVX512-NEXT: vmovdqa64 %zmm1, 128(%rcx) 3633; AVX512-NEXT: vmovdqa64 %zmm0, 192(%rcx) 3634; AVX512-NEXT: vmovdqa64 %zmm2, (%rcx) 3635; AVX512-NEXT: vmovdqa64 %zmm3, 64(%rcx) 3636; AVX512-NEXT: vzeroupper 3637; AVX512-NEXT: retq 3638; 3639; AVX512-FCP-LABEL: load_i32_stride3_vf64: 3640; AVX512-FCP: # %bb.0: 3641; AVX512-FCP-NEXT: vmovdqa64 704(%rdi), %zmm4 3642; AVX512-FCP-NEXT: vmovdqa64 640(%rdi), %zmm5 3643; AVX512-FCP-NEXT: vmovdqa64 576(%rdi), %zmm0 3644; AVX512-FCP-NEXT: vmovdqa64 512(%rdi), %zmm6 3645; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %zmm7 3646; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm1 3647; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm8 3648; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm9 3649; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 3650; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm10 3651; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm11 3652; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 3653; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0] 3654; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm13 3655; AVX512-FCP-NEXT: vpermt2d %zmm9, %zmm12, %zmm13 3656; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] 3657; AVX512-FCP-NEXT: vpermt2d %zmm8, %zmm14, %zmm13 3658; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm15 3659; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm12, %zmm15 3660; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm14, %zmm15 3661; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 3662; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm12, %zmm16 3663; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm14, %zmm16 3664; AVX512-FCP-NEXT: vpermi2d %zmm10, %zmm2, %zmm12 3665; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm14, %zmm12 3666; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0] 3667; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm17 3668; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm14, %zmm17 3669; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] 3670; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm18, %zmm17 3671; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm19 3672; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm14, %zmm19 3673; AVX512-FCP-NEXT: vpermt2d %zmm8, %zmm18, %zmm19 3674; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm20 3675; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm14, %zmm20 3676; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm18, %zmm20 3677; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm10, %zmm14 3678; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm18, %zmm14 3679; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0] 3680; AVX512-FCP-NEXT: vpermt2d %zmm9, %zmm18, %zmm3 3681; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] 3682; AVX512-FCP-NEXT: vpermt2d %zmm8, %zmm9, %zmm3 3683; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm18, %zmm0 3684; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm9, %zmm0 3685; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm18, %zmm1 3686; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm9, %zmm1 3687; AVX512-FCP-NEXT: vpermt2d %zmm10, %zmm18, %zmm2 3688; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm9, %zmm2 3689; AVX512-FCP-NEXT: vmovdqa64 %zmm16, 192(%rsi) 3690; AVX512-FCP-NEXT: vmovdqa64 %zmm15, 128(%rsi) 3691; AVX512-FCP-NEXT: vmovdqa64 %zmm13, 64(%rsi) 3692; AVX512-FCP-NEXT: vmovdqa64 %zmm12, (%rsi) 3693; AVX512-FCP-NEXT: vmovdqa64 %zmm20, 192(%rdx) 3694; AVX512-FCP-NEXT: vmovdqa64 %zmm14, (%rdx) 3695; AVX512-FCP-NEXT: vmovdqa64 %zmm19, 64(%rdx) 3696; AVX512-FCP-NEXT: vmovdqa64 %zmm17, 128(%rdx) 3697; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 128(%rcx) 3698; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 192(%rcx) 3699; AVX512-FCP-NEXT: vmovdqa64 %zmm2, (%rcx) 3700; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 64(%rcx) 3701; AVX512-FCP-NEXT: vzeroupper 3702; AVX512-FCP-NEXT: retq 3703; 3704; AVX512DQ-LABEL: load_i32_stride3_vf64: 3705; AVX512DQ: # %bb.0: 3706; AVX512DQ-NEXT: vmovdqa64 704(%rdi), %zmm4 3707; AVX512DQ-NEXT: vmovdqa64 640(%rdi), %zmm5 3708; AVX512DQ-NEXT: vmovdqa64 576(%rdi), %zmm0 3709; AVX512DQ-NEXT: vmovdqa64 512(%rdi), %zmm6 3710; AVX512DQ-NEXT: vmovdqa64 448(%rdi), %zmm7 3711; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm1 3712; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm8 3713; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm9 3714; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm2 3715; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm10 3716; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm11 3717; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm3 3718; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0] 3719; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm13 3720; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm12, %zmm13 3721; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] 3722; AVX512DQ-NEXT: vpermt2d %zmm8, %zmm14, %zmm13 3723; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm15 3724; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm12, %zmm15 3725; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm14, %zmm15 3726; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm16 3727; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm12, %zmm16 3728; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm14, %zmm16 3729; AVX512DQ-NEXT: vpermi2d %zmm10, %zmm2, %zmm12 3730; AVX512DQ-NEXT: vpermt2d %zmm11, %zmm14, %zmm12 3731; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm14 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0] 3732; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm17 3733; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm14, %zmm17 3734; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] 3735; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm18, %zmm17 3736; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm19 3737; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm14, %zmm19 3738; AVX512DQ-NEXT: vpermt2d %zmm8, %zmm18, %zmm19 3739; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm20 3740; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm14, %zmm20 3741; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm18, %zmm20 3742; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm10, %zmm14 3743; AVX512DQ-NEXT: vpermt2d %zmm11, %zmm18, %zmm14 3744; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm18 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0] 3745; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm18, %zmm3 3746; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] 3747; AVX512DQ-NEXT: vpermt2d %zmm8, %zmm9, %zmm3 3748; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm18, %zmm0 3749; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm9, %zmm0 3750; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm18, %zmm1 3751; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm9, %zmm1 3752; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm18, %zmm2 3753; AVX512DQ-NEXT: vpermt2d %zmm11, %zmm9, %zmm2 3754; AVX512DQ-NEXT: vmovdqa64 %zmm16, 192(%rsi) 3755; AVX512DQ-NEXT: vmovdqa64 %zmm15, 128(%rsi) 3756; AVX512DQ-NEXT: vmovdqa64 %zmm13, 64(%rsi) 3757; AVX512DQ-NEXT: vmovdqa64 %zmm12, (%rsi) 3758; AVX512DQ-NEXT: vmovdqa64 %zmm20, 192(%rdx) 3759; AVX512DQ-NEXT: vmovdqa64 %zmm14, (%rdx) 3760; AVX512DQ-NEXT: vmovdqa64 %zmm19, 64(%rdx) 3761; AVX512DQ-NEXT: vmovdqa64 %zmm17, 128(%rdx) 3762; AVX512DQ-NEXT: vmovdqa64 %zmm1, 128(%rcx) 3763; AVX512DQ-NEXT: vmovdqa64 %zmm0, 192(%rcx) 3764; AVX512DQ-NEXT: vmovdqa64 %zmm2, (%rcx) 3765; AVX512DQ-NEXT: vmovdqa64 %zmm3, 64(%rcx) 3766; AVX512DQ-NEXT: vzeroupper 3767; AVX512DQ-NEXT: retq 3768; 3769; AVX512DQ-FCP-LABEL: load_i32_stride3_vf64: 3770; AVX512DQ-FCP: # %bb.0: 3771; AVX512DQ-FCP-NEXT: vmovdqa64 704(%rdi), %zmm4 3772; AVX512DQ-FCP-NEXT: vmovdqa64 640(%rdi), %zmm5 3773; AVX512DQ-FCP-NEXT: vmovdqa64 576(%rdi), %zmm0 3774; AVX512DQ-FCP-NEXT: vmovdqa64 512(%rdi), %zmm6 3775; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %zmm7 3776; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm1 3777; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm8 3778; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm9 3779; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 3780; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm10 3781; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm11 3782; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 3783; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0] 3784; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm13 3785; AVX512DQ-FCP-NEXT: vpermt2d %zmm9, %zmm12, %zmm13 3786; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] 3787; AVX512DQ-FCP-NEXT: vpermt2d %zmm8, %zmm14, %zmm13 3788; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm15 3789; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm12, %zmm15 3790; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm14, %zmm15 3791; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 3792; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm12, %zmm16 3793; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm14, %zmm16 3794; AVX512DQ-FCP-NEXT: vpermi2d %zmm10, %zmm2, %zmm12 3795; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm14, %zmm12 3796; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0] 3797; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm17 3798; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm14, %zmm17 3799; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] 3800; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm18, %zmm17 3801; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm19 3802; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm14, %zmm19 3803; AVX512DQ-FCP-NEXT: vpermt2d %zmm8, %zmm18, %zmm19 3804; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm20 3805; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm14, %zmm20 3806; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm18, %zmm20 3807; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm10, %zmm14 3808; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm18, %zmm14 3809; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0] 3810; AVX512DQ-FCP-NEXT: vpermt2d %zmm9, %zmm18, %zmm3 3811; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] 3812; AVX512DQ-FCP-NEXT: vpermt2d %zmm8, %zmm9, %zmm3 3813; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm18, %zmm0 3814; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm9, %zmm0 3815; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm18, %zmm1 3816; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm9, %zmm1 3817; AVX512DQ-FCP-NEXT: vpermt2d %zmm10, %zmm18, %zmm2 3818; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm9, %zmm2 3819; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, 192(%rsi) 3820; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, 128(%rsi) 3821; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, 64(%rsi) 3822; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, (%rsi) 3823; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, 192(%rdx) 3824; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, (%rdx) 3825; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, 64(%rdx) 3826; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, 128(%rdx) 3827; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 128(%rcx) 3828; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 192(%rcx) 3829; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, (%rcx) 3830; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 64(%rcx) 3831; AVX512DQ-FCP-NEXT: vzeroupper 3832; AVX512DQ-FCP-NEXT: retq 3833; 3834; AVX512BW-LABEL: load_i32_stride3_vf64: 3835; AVX512BW: # %bb.0: 3836; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm4 3837; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm5 3838; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm0 3839; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm6 3840; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm7 3841; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm1 3842; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm8 3843; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm9 3844; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm2 3845; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm10 3846; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm11 3847; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm3 3848; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0] 3849; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm13 3850; AVX512BW-NEXT: vpermt2d %zmm9, %zmm12, %zmm13 3851; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] 3852; AVX512BW-NEXT: vpermt2d %zmm8, %zmm14, %zmm13 3853; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm15 3854; AVX512BW-NEXT: vpermt2d %zmm7, %zmm12, %zmm15 3855; AVX512BW-NEXT: vpermt2d %zmm6, %zmm14, %zmm15 3856; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm16 3857; AVX512BW-NEXT: vpermt2d %zmm5, %zmm12, %zmm16 3858; AVX512BW-NEXT: vpermt2d %zmm4, %zmm14, %zmm16 3859; AVX512BW-NEXT: vpermi2d %zmm10, %zmm2, %zmm12 3860; AVX512BW-NEXT: vpermt2d %zmm11, %zmm14, %zmm12 3861; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm14 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0] 3862; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm17 3863; AVX512BW-NEXT: vpermt2d %zmm1, %zmm14, %zmm17 3864; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] 3865; AVX512BW-NEXT: vpermt2d %zmm6, %zmm18, %zmm17 3866; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm19 3867; AVX512BW-NEXT: vpermt2d %zmm3, %zmm14, %zmm19 3868; AVX512BW-NEXT: vpermt2d %zmm8, %zmm18, %zmm19 3869; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm20 3870; AVX512BW-NEXT: vpermt2d %zmm0, %zmm14, %zmm20 3871; AVX512BW-NEXT: vpermt2d %zmm4, %zmm18, %zmm20 3872; AVX512BW-NEXT: vpermi2d %zmm2, %zmm10, %zmm14 3873; AVX512BW-NEXT: vpermt2d %zmm11, %zmm18, %zmm14 3874; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm18 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0] 3875; AVX512BW-NEXT: vpermt2d %zmm9, %zmm18, %zmm3 3876; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] 3877; AVX512BW-NEXT: vpermt2d %zmm8, %zmm9, %zmm3 3878; AVX512BW-NEXT: vpermt2d %zmm5, %zmm18, %zmm0 3879; AVX512BW-NEXT: vpermt2d %zmm4, %zmm9, %zmm0 3880; AVX512BW-NEXT: vpermt2d %zmm7, %zmm18, %zmm1 3881; AVX512BW-NEXT: vpermt2d %zmm6, %zmm9, %zmm1 3882; AVX512BW-NEXT: vpermt2d %zmm10, %zmm18, %zmm2 3883; AVX512BW-NEXT: vpermt2d %zmm11, %zmm9, %zmm2 3884; AVX512BW-NEXT: vmovdqa64 %zmm16, 192(%rsi) 3885; AVX512BW-NEXT: vmovdqa64 %zmm15, 128(%rsi) 3886; AVX512BW-NEXT: vmovdqa64 %zmm13, 64(%rsi) 3887; AVX512BW-NEXT: vmovdqa64 %zmm12, (%rsi) 3888; AVX512BW-NEXT: vmovdqa64 %zmm20, 192(%rdx) 3889; AVX512BW-NEXT: vmovdqa64 %zmm14, (%rdx) 3890; AVX512BW-NEXT: vmovdqa64 %zmm19, 64(%rdx) 3891; AVX512BW-NEXT: vmovdqa64 %zmm17, 128(%rdx) 3892; AVX512BW-NEXT: vmovdqa64 %zmm1, 128(%rcx) 3893; AVX512BW-NEXT: vmovdqa64 %zmm0, 192(%rcx) 3894; AVX512BW-NEXT: vmovdqa64 %zmm2, (%rcx) 3895; AVX512BW-NEXT: vmovdqa64 %zmm3, 64(%rcx) 3896; AVX512BW-NEXT: vzeroupper 3897; AVX512BW-NEXT: retq 3898; 3899; AVX512BW-FCP-LABEL: load_i32_stride3_vf64: 3900; AVX512BW-FCP: # %bb.0: 3901; AVX512BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm4 3902; AVX512BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm5 3903; AVX512BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm0 3904; AVX512BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm6 3905; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm7 3906; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm1 3907; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm8 3908; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm9 3909; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 3910; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm10 3911; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm11 3912; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 3913; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0] 3914; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm13 3915; AVX512BW-FCP-NEXT: vpermt2d %zmm9, %zmm12, %zmm13 3916; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] 3917; AVX512BW-FCP-NEXT: vpermt2d %zmm8, %zmm14, %zmm13 3918; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm15 3919; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm12, %zmm15 3920; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm14, %zmm15 3921; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 3922; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm12, %zmm16 3923; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm14, %zmm16 3924; AVX512BW-FCP-NEXT: vpermi2d %zmm10, %zmm2, %zmm12 3925; AVX512BW-FCP-NEXT: vpermt2d %zmm11, %zmm14, %zmm12 3926; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0] 3927; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm17 3928; AVX512BW-FCP-NEXT: vpermt2d %zmm1, %zmm14, %zmm17 3929; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] 3930; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm18, %zmm17 3931; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm19 3932; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm14, %zmm19 3933; AVX512BW-FCP-NEXT: vpermt2d %zmm8, %zmm18, %zmm19 3934; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm20 3935; AVX512BW-FCP-NEXT: vpermt2d %zmm0, %zmm14, %zmm20 3936; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm18, %zmm20 3937; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm10, %zmm14 3938; AVX512BW-FCP-NEXT: vpermt2d %zmm11, %zmm18, %zmm14 3939; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0] 3940; AVX512BW-FCP-NEXT: vpermt2d %zmm9, %zmm18, %zmm3 3941; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] 3942; AVX512BW-FCP-NEXT: vpermt2d %zmm8, %zmm9, %zmm3 3943; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm18, %zmm0 3944; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm9, %zmm0 3945; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm18, %zmm1 3946; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm9, %zmm1 3947; AVX512BW-FCP-NEXT: vpermt2d %zmm10, %zmm18, %zmm2 3948; AVX512BW-FCP-NEXT: vpermt2d %zmm11, %zmm9, %zmm2 3949; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, 192(%rsi) 3950; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, 128(%rsi) 3951; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, 64(%rsi) 3952; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, (%rsi) 3953; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, 192(%rdx) 3954; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, (%rdx) 3955; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, 64(%rdx) 3956; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, 128(%rdx) 3957; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, 128(%rcx) 3958; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 192(%rcx) 3959; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, (%rcx) 3960; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, 64(%rcx) 3961; AVX512BW-FCP-NEXT: vzeroupper 3962; AVX512BW-FCP-NEXT: retq 3963; 3964; AVX512DQ-BW-LABEL: load_i32_stride3_vf64: 3965; AVX512DQ-BW: # %bb.0: 3966; AVX512DQ-BW-NEXT: vmovdqa64 704(%rdi), %zmm4 3967; AVX512DQ-BW-NEXT: vmovdqa64 640(%rdi), %zmm5 3968; AVX512DQ-BW-NEXT: vmovdqa64 576(%rdi), %zmm0 3969; AVX512DQ-BW-NEXT: vmovdqa64 512(%rdi), %zmm6 3970; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm7 3971; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm1 3972; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm8 3973; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm9 3974; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm2 3975; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm10 3976; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm11 3977; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm3 3978; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0] 3979; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm13 3980; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm12, %zmm13 3981; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] 3982; AVX512DQ-BW-NEXT: vpermt2d %zmm8, %zmm14, %zmm13 3983; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm15 3984; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm12, %zmm15 3985; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm14, %zmm15 3986; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm16 3987; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm12, %zmm16 3988; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm14, %zmm16 3989; AVX512DQ-BW-NEXT: vpermi2d %zmm10, %zmm2, %zmm12 3990; AVX512DQ-BW-NEXT: vpermt2d %zmm11, %zmm14, %zmm12 3991; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm14 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0] 3992; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm17 3993; AVX512DQ-BW-NEXT: vpermt2d %zmm1, %zmm14, %zmm17 3994; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] 3995; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm18, %zmm17 3996; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm19 3997; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm14, %zmm19 3998; AVX512DQ-BW-NEXT: vpermt2d %zmm8, %zmm18, %zmm19 3999; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm20 4000; AVX512DQ-BW-NEXT: vpermt2d %zmm0, %zmm14, %zmm20 4001; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm18, %zmm20 4002; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm10, %zmm14 4003; AVX512DQ-BW-NEXT: vpermt2d %zmm11, %zmm18, %zmm14 4004; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm18 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0] 4005; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm18, %zmm3 4006; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] 4007; AVX512DQ-BW-NEXT: vpermt2d %zmm8, %zmm9, %zmm3 4008; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm18, %zmm0 4009; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm9, %zmm0 4010; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm18, %zmm1 4011; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm9, %zmm1 4012; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm18, %zmm2 4013; AVX512DQ-BW-NEXT: vpermt2d %zmm11, %zmm9, %zmm2 4014; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, 192(%rsi) 4015; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, 128(%rsi) 4016; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, 64(%rsi) 4017; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, (%rsi) 4018; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, 192(%rdx) 4019; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, (%rdx) 4020; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, 64(%rdx) 4021; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, 128(%rdx) 4022; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, 128(%rcx) 4023; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 192(%rcx) 4024; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, (%rcx) 4025; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, 64(%rcx) 4026; AVX512DQ-BW-NEXT: vzeroupper 4027; AVX512DQ-BW-NEXT: retq 4028; 4029; AVX512DQ-BW-FCP-LABEL: load_i32_stride3_vf64: 4030; AVX512DQ-BW-FCP: # %bb.0: 4031; AVX512DQ-BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm4 4032; AVX512DQ-BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm5 4033; AVX512DQ-BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm0 4034; AVX512DQ-BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm6 4035; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm7 4036; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm1 4037; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm8 4038; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm9 4039; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm2 4040; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm10 4041; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm11 4042; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3 4043; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm12 = [0,3,6,9,12,15,18,21,24,27,30,0,0,0,0,0] 4044; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm13 4045; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm9, %zmm12, %zmm13 4046; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] 4047; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm8, %zmm14, %zmm13 4048; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm15 4049; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm12, %zmm15 4050; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm14, %zmm15 4051; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 4052; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm12, %zmm16 4053; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm14, %zmm16 4054; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm10, %zmm2, %zmm12 4055; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm11, %zmm14, %zmm12 4056; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [17,20,23,26,29,0,3,6,9,12,15,0,0,0,0,0] 4057; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm17 4058; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm1, %zmm14, %zmm17 4059; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] 4060; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm18, %zmm17 4061; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm19 4062; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm14, %zmm19 4063; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm8, %zmm18, %zmm19 4064; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm20 4065; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm0, %zmm14, %zmm20 4066; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm18, %zmm20 4067; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm10, %zmm14 4068; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm11, %zmm18, %zmm14 4069; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [2,5,8,11,14,17,20,23,26,29,0,0,0,0,0,0] 4070; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm9, %zmm18, %zmm3 4071; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] 4072; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm8, %zmm9, %zmm3 4073; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm18, %zmm0 4074; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm9, %zmm0 4075; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm18, %zmm1 4076; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm9, %zmm1 4077; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm10, %zmm18, %zmm2 4078; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm11, %zmm9, %zmm2 4079; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, 192(%rsi) 4080; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, 128(%rsi) 4081; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, 64(%rsi) 4082; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, (%rsi) 4083; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, 192(%rdx) 4084; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, (%rdx) 4085; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, 64(%rdx) 4086; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, 128(%rdx) 4087; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, 128(%rcx) 4088; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 192(%rcx) 4089; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, (%rcx) 4090; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, 64(%rcx) 4091; AVX512DQ-BW-FCP-NEXT: vzeroupper 4092; AVX512DQ-BW-FCP-NEXT: retq 4093 %wide.vec = load <192 x i32>, ptr %in.vec, align 64 4094 %strided.vec0 = shufflevector <192 x i32> %wide.vec, <192 x i32> poison, <64 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45, i32 48, i32 51, i32 54, i32 57, i32 60, i32 63, i32 66, i32 69, i32 72, i32 75, i32 78, i32 81, i32 84, i32 87, i32 90, i32 93, i32 96, i32 99, i32 102, i32 105, i32 108, i32 111, i32 114, i32 117, i32 120, i32 123, i32 126, i32 129, i32 132, i32 135, i32 138, i32 141, i32 144, i32 147, i32 150, i32 153, i32 156, i32 159, i32 162, i32 165, i32 168, i32 171, i32 174, i32 177, i32 180, i32 183, i32 186, i32 189> 4095 %strided.vec1 = shufflevector <192 x i32> %wide.vec, <192 x i32> poison, <64 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22, i32 25, i32 28, i32 31, i32 34, i32 37, i32 40, i32 43, i32 46, i32 49, i32 52, i32 55, i32 58, i32 61, i32 64, i32 67, i32 70, i32 73, i32 76, i32 79, i32 82, i32 85, i32 88, i32 91, i32 94, i32 97, i32 100, i32 103, i32 106, i32 109, i32 112, i32 115, i32 118, i32 121, i32 124, i32 127, i32 130, i32 133, i32 136, i32 139, i32 142, i32 145, i32 148, i32 151, i32 154, i32 157, i32 160, i32 163, i32 166, i32 169, i32 172, i32 175, i32 178, i32 181, i32 184, i32 187, i32 190> 4096 %strided.vec2 = shufflevector <192 x i32> %wide.vec, <192 x i32> poison, <64 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23, i32 26, i32 29, i32 32, i32 35, i32 38, i32 41, i32 44, i32 47, i32 50, i32 53, i32 56, i32 59, i32 62, i32 65, i32 68, i32 71, i32 74, i32 77, i32 80, i32 83, i32 86, i32 89, i32 92, i32 95, i32 98, i32 101, i32 104, i32 107, i32 110, i32 113, i32 116, i32 119, i32 122, i32 125, i32 128, i32 131, i32 134, i32 137, i32 140, i32 143, i32 146, i32 149, i32 152, i32 155, i32 158, i32 161, i32 164, i32 167, i32 170, i32 173, i32 176, i32 179, i32 182, i32 185, i32 188, i32 191> 4097 store <64 x i32> %strided.vec0, ptr %out.vec0, align 64 4098 store <64 x i32> %strided.vec1, ptr %out.vec1, align 64 4099 store <64 x i32> %strided.vec2, ptr %out.vec2, align 64 4100 ret void 4101} 4102