; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --no_x86_scrub_mem_shuffle
; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse3 | FileCheck %s --check-prefix=SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX2

; PR21281
define <64 x i16> @interleave8x8(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c, <8 x i16> %d, <8 x i16> %e, <8 x i16> %f, <8 x i16> %h, <8 x i16> %g) {
; SSE-LABEL: interleave8x8:
; SSE: # %bb.0:
; SSE-NEXT: movq %rdi, %rax
; SSE-NEXT: movdqa %xmm0, %xmm8
; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3]
; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE-NEXT: movdqa %xmm2, %xmm9
; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm3[0],xmm9[1],xmm3[1],xmm9[2],xmm3[2],xmm9[3],xmm3[3]
; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
; SSE-NEXT: movdqa %xmm0, %xmm3
; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm2[2],xmm3[3],xmm2[3]
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE-NEXT: movdqa %xmm8, %xmm1
; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm9[2],xmm1[3],xmm9[3]
; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1]
; SSE-NEXT: movdqa %xmm4, %xmm2
; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
; SSE-NEXT: movdqa %xmm7, %xmm5
; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
; SSE-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
; SSE-NEXT: movdqa %xmm4, %xmm6
; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm7[2],xmm6[3],xmm7[3]
; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1]
; SSE-NEXT: movdqa %xmm2, %xmm7
; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm5[2],xmm7[3],xmm5[3]
; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
; SSE-NEXT: movdqa %xmm8, %xmm5
; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3]
; SSE-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm2[4],xmm8[5],xmm2[5],xmm8[6],xmm2[6],xmm8[7],xmm2[7]
; SSE-NEXT: movdqa %xmm1, %xmm2
; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3]
; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7]
; SSE-NEXT: movdqa %xmm0, %xmm7
; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3]
; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
; SSE-NEXT: movdqa %xmm3, %xmm4
; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3]
; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7]
; SSE-NEXT: movdqa %xmm3, 112(%rdi)
; SSE-NEXT: movdqa %xmm4, 96(%rdi)
; SSE-NEXT: movdqa %xmm0, 80(%rdi)
; SSE-NEXT: movdqa %xmm7, 64(%rdi)
; SSE-NEXT: movdqa %xmm1, 48(%rdi)
; SSE-NEXT: movdqa %xmm2, 32(%rdi)
; SSE-NEXT: movdqa %xmm8, 16(%rdi)
; SSE-NEXT: movdqa %xmm5, (%rdi)
; SSE-NEXT: retq
;
; AVX1-LABEL: interleave8x8:
; AVX1: # %bb.0:
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm8[2],xmm1[2],xmm8[3],xmm1[3]
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm8[0],xmm1[0],xmm8[1],xmm1[1]
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm7 = xmm4[2],xmm6[2],xmm4[3],xmm6[3]
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1]
; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm1[2],xmm5[2],xmm1[3],xmm5[3]
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1]
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7]
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm5, %ymm1
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7]
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3]
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
; AVX1-NEXT: retq
;
; AVX2-LABEL: interleave8x8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm0
; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
; AVX2-NEXT: vpunpckhdq {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; AVX2-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm0[2,3],ymm2[2,3]
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[0,1],ymm2[0,1]
; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2
; AVX2-NEXT: vpunpckhdq {{.*#+}} ymm4 = ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[6],ymm2[6],ymm1[7],ymm2[7]
; AVX2-NEXT: vpunpckldq {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5]
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm1[2,3],ymm4[2,3]
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[0,1],ymm4[0,1]
; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15]
; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[0,1],ymm4[0,1]
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm4[2,3]
; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15]
; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11]
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm3[0,1],ymm4[0,1]
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[2,3],ymm4[2,3]
; AVX2-NEXT: retq
  %ab = shufflevector <8 x i16> %a, <8 x i16> %b, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
  %cd = shufflevector <8 x i16> %c, <8 x i16> %d, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
  %ab32 = bitcast <16 x i16> %ab to <8 x i32>
  %cd32 = bitcast <16 x i16> %cd to <8 x i32>
  %abcd32 = shufflevector <8 x i32> %ab32, <8 x i32> %cd32, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
  %abcd = bitcast <16 x i32> %abcd32 to <32 x i16>

  %ef = shufflevector <8 x i16> %e, <8 x i16> %f, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
  %gh = shufflevector <8 x i16> %g, <8 x i16> %h, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
  %ef32 = bitcast <16 x i16> %ef to <8 x i32>
  %gh32 = bitcast <16 x i16> %gh to <8 x i32>
  %efgh32 = shufflevector <8 x i32> %ef32, <8 x i32> %gh32, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
  %efgh = bitcast <16 x i32> %efgh32 to <32 x i16>

  %result = shufflevector <32 x i16> %abcd, <32 x i16> %efgh, <64 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
  ret <64 x i16> %result
}

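; Interleave the elements of two <4 x double> vectors: the result is (a0, b0, a1, b1, a2, b2, a3, b3).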
define <8 x double> @interleave2x4f64(<4 x double> %a, <4 x double> %b) {
; SSE-LABEL: interleave2x4f64:
; SSE: # %bb.0:
; SSE-NEXT: movaps %xmm0, %xmm4
; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1]
; SSE-NEXT: movaps %xmm1, %xmm2
; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; SSE-NEXT: movhlps {{.*#+}} xmm3 = xmm1[1],xmm3[1]
; SSE-NEXT: movaps %xmm4, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: interleave2x4f64:
; AVX1: # %bb.0:
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,2,3]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,2,3]
; AVX1-NEXT: vshufpd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[3],ymm2[3]
; AVX1-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm0[1],xmm1[1]
; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX1-NEXT: vmovapd %ymm2, %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: interleave2x4f64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm1[0,0,2,1]
; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[0,1,1,3]
; AVX2-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2],ymm2[3]
; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,1,2,3]
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,2,3]
; AVX2-NEXT: vshufpd {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[3],ymm1[3]
; AVX2-NEXT: vmovapd %ymm2, %ymm0
; AVX2-NEXT: retq
  %result = shufflevector <4 x double> %a, <4 x double> %b, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
  ret <8 x double> %result
}

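; Same interleave pattern as above, with <4 x i64> elements.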
define <8 x i64> @interleave2x4i64(<4 x i64> %a, <4 x i64> %b) {
; SSE-LABEL: interleave2x4i64:
; SSE: # %bb.0:
; SSE-NEXT: movaps %xmm1, %xmm4
; SSE-NEXT: movaps %xmm0, %xmm1
; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
; SSE-NEXT: movaps %xmm4, %xmm2
; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1]
; SSE-NEXT: movaps %xmm4, %xmm3
; SSE-NEXT: retq
;
; AVX1-LABEL: interleave2x4i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,2,3]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3,2,3]
; AVX1-NEXT: vshufpd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[3],ymm2[3]
; AVX1-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm0[1],xmm1[1]
; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX1-NEXT: vmovapd %ymm2, %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: interleave2x4i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm1[0,0,2,1]
; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm0[0,1,1,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7]
; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,2,2,3]
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,1,3,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX2-NEXT: vmovaps %ymm2, %ymm0
; AVX2-NEXT: retq
  %result = shufflevector <4 x i64> %a, <4 x i64> %b, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
  ret <8 x i64> %result
}

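; Interleave two <8 x float> vectors into a <16 x float> result.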
define <16 x float> @interleave2x8f32(<8 x float> %a, <8 x float> %b) {
; SSE-LABEL: interleave2x8f32:
; SSE: # %bb.0:
; SSE-NEXT: movaps %xmm1, %xmm4
; SSE-NEXT: movaps %xmm0, %xmm1
; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; SSE-NEXT: movaps %xmm4, %xmm2
; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; SSE-NEXT: movaps %xmm4, %xmm3
; SSE-NEXT: retq
;
; AVX1-LABEL: interleave2x8f32:
; AVX1: # %bb.0:
; AVX1-NEXT: vunpckhps {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vunpcklps {{.*#+}} xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vunpckhps {{.*#+}} xmm3 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1
; AVX1-NEXT: vmovaps %ymm2, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: interleave2x8f32:
; AVX2: # %bb.0:
; AVX2-NEXT: vunpckhps {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; AVX2-NEXT: vunpcklps {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[0,1],ymm2[0,1]
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3]
; AVX2-NEXT: retq
  %result = shufflevector <8 x float> %a, <8 x float> %b, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
  ret <16 x float> %result
}

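; Interleave two <8 x i32> vectors into a <16 x i32> result.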
define <16 x i32> @interleave2x8i32(<8 x i32> %a, <8 x i32> %b) {
; SSE-LABEL: interleave2x8i32:
; SSE: # %bb.0:
; SSE-NEXT: movaps %xmm1, %xmm4
; SSE-NEXT: movaps %xmm0, %xmm1
; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; SSE-NEXT: movaps %xmm4, %xmm2
; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; SSE-NEXT: movaps %xmm4, %xmm3
; SSE-NEXT: retq
;
; AVX1-LABEL: interleave2x8i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vunpckhps {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vunpcklps {{.*#+}} xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vunpckhps {{.*#+}} xmm3 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1
; AVX1-NEXT: vmovaps %ymm2, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: interleave2x8i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vunpckhps {{.*#+}} ymm2 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
; AVX2-NEXT: vunpcklps {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm1[0,1],ymm2[0,1]
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3]
; AVX2-NEXT: retq
  %result = shufflevector <8 x i32> %a, <8 x i32> %b, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
  ret <16 x i32> %result
}

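; Interleave two <16 x i16> vectors into a <32 x i16> result.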
define <32 x i16> @interleave2x16i16(<16 x i16> %a, <16 x i16> %b) {
; SSE-LABEL: interleave2x16i16:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm1, %xmm4
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSE-NEXT: movdqa %xmm4, %xmm2
; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; SSE-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
; SSE-NEXT: movdqa %xmm4, %xmm3
; SSE-NEXT: retq
;
; AVX1-LABEL: interleave2x16i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1
; AVX1-NEXT: vmovaps %ymm2, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: interleave2x16i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15]
; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[0,1],ymm2[0,1]
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3]
; AVX2-NEXT: retq
  %result = shufflevector <16 x i16> %a, <16 x i16> %b, <32 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
  ret <32 x i16> %result
}

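; Interleave two <32 x i16> vectors into a <64 x i16> result (returned indirectly through %rdi in the SSE lowering).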
define <64 x i16> @interleave2x32i16(<32 x i16> %a, <32 x i16> %b) {
; SSE-LABEL: interleave2x32i16:
; SSE: # %bb.0:
; SSE-NEXT: movq %rdi, %rax
; SSE-NEXT: movdqa %xmm0, %xmm8
; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3]
; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
; SSE-NEXT: movdqa %xmm1, %xmm4
; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7]
; SSE-NEXT: movdqa %xmm2, %xmm5
; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3]
; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
; SSE-NEXT: movdqa %xmm3, %xmm6
; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7]
; SSE-NEXT: movdqa %xmm3, 112(%rdi)
; SSE-NEXT: movdqa %xmm6, 96(%rdi)
; SSE-NEXT: movdqa %xmm2, 80(%rdi)
; SSE-NEXT: movdqa %xmm5, 64(%rdi)
; SSE-NEXT: movdqa %xmm1, 48(%rdi)
; SSE-NEXT: movdqa %xmm4, 32(%rdi)
; SSE-NEXT: movdqa %xmm0, 16(%rdi)
; SSE-NEXT: movdqa %xmm8, (%rdi)
; SSE-NEXT: retq
;
; AVX1-LABEL: interleave2x32i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm2
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm0
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
; AVX1-NEXT: vmovaps %ymm4, %ymm0
; AVX1-NEXT: vmovaps %ymm5, %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: interleave2x32i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15]
; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11]
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[0,1],ymm4[0,1]
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm2[2,3],ymm4[2,3]
; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15]
; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11]
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm1[0,1],ymm5[0,1]
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm1[2,3],ymm5[2,3]
; AVX2-NEXT: vmovdqa %ymm4, %ymm1
; AVX2-NEXT: retq
  %result = shufflevector <32 x i16> %a, <32 x i16> %b, <64 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
  ret <64 x i16> %result
}

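; Interleave two <32 x i8> vectors into a <64 x i8> result.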
define <64 x i8> @interleave2x32i8(<32 x i8> %a, <32 x i8> %b) {
; SSE-LABEL: interleave2x32i8:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm1, %xmm4
; SSE-NEXT: movdqa %xmm0, %xmm1
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
; SSE-NEXT: movdqa %xmm4, %xmm2
; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15]
; SSE-NEXT: movdqa %xmm4, %xmm3
; SSE-NEXT: retq
;
; AVX1-LABEL: interleave2x32i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1
; AVX1-NEXT: vmovaps %ymm2, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: interleave2x32i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm1[0,1],ymm2[0,1]
; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3]
; AVX2-NEXT: retq
  %result = shufflevector <32 x i8> %a, <32 x i8> %b, <64 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
  ret <64 x i8> %result
}

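; The splat2_* tests load a vector, concatenate it with itself, and interleave the two copies, so each element ends up duplicated in the stored result.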
define void @splat2_i8(ptr %s, ptr %d) {
; SSE-LABEL: splat2_i8:
; SSE: # %bb.0:
; SSE-NEXT: movdqu (%rdi), %xmm0
; SSE-NEXT: movdqu 16(%rdi), %xmm1
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE-NEXT: movdqa %xmm1, %xmm3
; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; SSE-NEXT: movdqu %xmm1, 48(%rsi)
; SSE-NEXT: movdqu %xmm3, 32(%rsi)
; SSE-NEXT: movdqu %xmm0, 16(%rsi)
; SSE-NEXT: movdqu %xmm2, (%rsi)
; SSE-NEXT: retq
;
; AVX1-LABEL: splat2_i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqu (%rdi), %xmm0
; AVX1-NEXT: vmovdqu 16(%rdi), %xmm1
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vmovdqu %xmm1, 48(%rsi)
; AVX1-NEXT: vmovdqu %xmm3, 32(%rsi)
; AVX1-NEXT: vmovdqu %xmm0, 16(%rsi)
; AVX1-NEXT: vmovdqu %xmm2, (%rsi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: splat2_i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vpermq $216, (%rdi), %ymm0 # ymm0 = mem[0,2,1,3]
; AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX2-NEXT: vpunpckhbw {{.*#+}} ymm0 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX2-NEXT: vmovdqu %ymm0, 32(%rsi)
; AVX2-NEXT: vmovdqu %ymm1, (%rsi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
  %ld32 = load <32 x i8>, ptr %s, align 1
  %cat = shufflevector <32 x i8> %ld32, <32 x i8> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %cat2 = shufflevector <64 x i8> %cat, <64 x i8> undef, <64 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
  store <64 x i8> %cat2, ptr %d, align 1
  ret void
}

define void @splat2_i16(ptr %s, ptr %d) {
; SSE-LABEL: splat2_i16:
; SSE: # %bb.0:
; SSE-NEXT: movdqu (%rdi), %xmm0
; SSE-NEXT: movdqu 16(%rdi), %xmm1
; SSE-NEXT: movdqa %xmm0, %xmm2
; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; SSE-NEXT: movdqa %xmm1, %xmm3
; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSE-NEXT: movdqu %xmm1, 48(%rsi)
; SSE-NEXT: movdqu %xmm3, 32(%rsi)
; SSE-NEXT: movdqu %xmm0, 16(%rsi)
; SSE-NEXT: movdqu %xmm2, (%rsi)
; SSE-NEXT: retq
;
; AVX1-LABEL: splat2_i16:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqu (%rdi), %xmm0
; AVX1-NEXT: vmovdqu 16(%rdi), %xmm1
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0,0,1,1,2,2,3,3]
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm1[0,0,1,1,2,2,3,3]
; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; AVX1-NEXT: vmovdqu %xmm1, 48(%rsi)
; AVX1-NEXT: vmovdqu %xmm3, 32(%rsi)
; AVX1-NEXT: vmovdqu %xmm0, 16(%rsi)
; AVX1-NEXT: vmovdqu %xmm2, (%rsi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: splat2_i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vpermq $216, (%rdi), %ymm0 # ymm0 = mem[0,2,1,3]
; AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
; AVX2-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
; AVX2-NEXT: vmovdqu %ymm0, 32(%rsi)
; AVX2-NEXT: vmovdqu %ymm1, (%rsi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
  %ld32 = load <16 x i16>, ptr %s, align 1
  %cat = shufflevector <16 x i16> %ld32, <16 x i16> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %cat2 = shufflevector <32 x i16> %cat, <32 x i16> undef, <32 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
  store <32 x i16> %cat2, ptr %d, align 1
  ret void
}

define void @splat2_i32(ptr %s, ptr %d) {
; SSE-LABEL: splat2_i32:
; SSE: # %bb.0:
; SSE-NEXT: movdqu (%rdi), %xmm0
; SSE-NEXT: movdqu 16(%rdi), %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,1,1]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,1,1]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,3,3]
; SSE-NEXT: movdqu %xmm1, 48(%rsi)
; SSE-NEXT: movdqu %xmm3, 32(%rsi)
; SSE-NEXT: movdqu %xmm0, 16(%rsi)
; SSE-NEXT: movdqu %xmm2, (%rsi)
; SSE-NEXT: retq
;
; AVX1-LABEL: splat2_i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovups (%rdi), %xmm0
; AVX1-NEXT: vmovups 16(%rdi), %xmm1
; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm0[0,0,1,1]
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,2,3,3]
; AVX1-NEXT: vshufps {{.*#+}} xmm3 = xmm1[0,0,1,1]
; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,2,3,3]
; AVX1-NEXT: vmovups %xmm1, 48(%rsi)
; AVX1-NEXT: vmovups %xmm3, 32(%rsi)
; AVX1-NEXT: vmovups %xmm0, 16(%rsi)
; AVX1-NEXT: vmovups %xmm2, (%rsi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: splat2_i32:
; AVX2: # %bb.0:
; AVX2-NEXT: vpermpd $216, (%rdi), %ymm0 # ymm0 = mem[0,2,1,3]
; AVX2-NEXT: vshufps {{.*#+}} ymm1 = ymm0[0,0,1,1,4,4,5,5]
; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,2,3,3,6,6,7,7]
; AVX2-NEXT: vmovups %ymm0, 32(%rsi)
; AVX2-NEXT: vmovups %ymm1, (%rsi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
  %ld32 = load <8 x i32>, ptr %s, align 1
  %cat = shufflevector <8 x i32> %ld32, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %cat2 = shufflevector <16 x i32> %cat, <16 x i32> undef, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
  store <16 x i32> %cat2, ptr %d, align 1
  ret void
}

define void @splat2_i64(ptr %s, ptr %d) {
; SSE-LABEL: splat2_i64:
; SSE: # %bb.0:
; SSE-NEXT: movdqu (%rdi), %xmm0
; SSE-NEXT: movdqu 16(%rdi), %xmm1
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,1,0,1]
; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE-NEXT: movdqu %xmm1, 48(%rsi)
; SSE-NEXT: movdqu %xmm3, 32(%rsi)
; SSE-NEXT: movdqu %xmm0, 16(%rsi)
; SSE-NEXT: movdqu %xmm2, (%rsi)
; SSE-NEXT: retq
;
; AVX1-LABEL: splat2_i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vperm2f128 $51, (%rdi), %ymm0, %ymm0 # ymm0 = mem[2,3,2,3]
; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0,0,3,3]
; AVX1-NEXT: vbroadcastf128 (%rdi), %ymm1 # ymm1 = mem[0,1,0,1]
; AVX1-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0,0,3,3]
; AVX1-NEXT: vmovupd %ymm0, 32(%rsi)
; AVX1-NEXT: vmovupd %ymm1, (%rsi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: splat2_i64:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovups (%rdi), %ymm0
; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[0,0,1,1]
; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,3,3]
; AVX2-NEXT: vmovups %ymm0, 32(%rsi)
; AVX2-NEXT: vmovups %ymm1, (%rsi)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
  %ld32 = load <4 x i64>, ptr %s, align 1
  %cat = shufflevector <4 x i64> %ld32, <4 x i64> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  %cat2 = shufflevector <8 x i64> %cat, <8 x i64> undef, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
  store <8 x i64> %cat2, ptr %d, align 1
  ret void
}