; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=ALL,AVX1OR2,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=ALL,AVX1OR2,AVX2OR512VL,AVX2,AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=ALL,AVX1OR2,AVX2OR512VL,AVX2,AVX2-FAST,AVX2-FAST-ALL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=ALL,AVX1OR2,AVX2OR512VL,AVX2,AVX2-FAST,AVX2-FAST-PERLANE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=ALL,AVX2OR512VL,AVX512VL,AVX512VL-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=ALL,AVX2OR512VL,AVX512VL,AVX512VL-FAST,AVX512VL-FAST-CROSSLANE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=ALL,AVX2OR512VL,AVX512VL,AVX512VL-FAST,AVX512VL-FAST-PERLANE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefixes=ALL,XOP,XOPAVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=ALL,XOP,XOPAVX2

; Exercises lowering of a two-input <16 x i16> interleave followed by a
; single-input <8 x i32> permute, with both results returned in a struct.
;   - %shuffle.i interleaves words 0-3 and 8-11 of %a with the same words of %b.
;   - %shuffle.i59 interleaves words 4-7 and 12-15 of %a with the same words of %b.
; Each interleave is bitcast to <8 x i32> and permuted with <0,4,1,5,2,6,3,7>,
; which alternates dwords taken from the low and the high four-dword halves.
; The insertvalue chain starts from poison because both struct fields are
; overwritten before the return, so the placeholder value is never observed.
define { <8 x i32>, <8 x i32> } @splitTransposeDecode_8_avx2(<16 x i16> %a, <16 x i16> %b) {
; AVX1-LABEL: splitTransposeDecode_8_avx2:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm6 = xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7]
; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3]
; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm2, %ymm2
; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm1
; AVX1-NEXT:    vmovaps %ymm2, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: splitTransposeDecode_8_avx2:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
; AVX2-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [0,4,1,5,2,6,3,7]
; AVX2-NEXT:    vpermd %ymm2, %ymm3, %ymm2
; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15]
; AVX2-NEXT:    vpermd %ymm0, %ymm3, %ymm1
; AVX2-NEXT:    vmovdqa %ymm2, %ymm0
; AVX2-NEXT:    retq
;
; AVX512VL-LABEL: splitTransposeDecode_8_avx2:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [0,16,8,24,1,17,9,25,2,18,10,26,3,19,11,27]
; AVX512VL-NEXT:    vpermi2w %ymm1, %ymm0, %ymm2
; AVX512VL-NEXT:    vpmovsxbw {{.*#+}} ymm3 = [4,20,12,28,5,21,13,29,6,22,14,30,7,23,15,31]
; AVX512VL-NEXT:    vpermi2w %ymm1, %ymm0, %ymm3
; AVX512VL-NEXT:    vmovdqa %ymm2, %ymm0
; AVX512VL-NEXT:    vmovdqa %ymm3, %ymm1
; AVX512VL-NEXT:    retq
;
; XOPAVX1-LABEL: splitTransposeDecode_8_avx2:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
; XOPAVX1-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
; XOPAVX1-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
; XOPAVX1-NEXT:    vpunpckhwd {{.*#+}} xmm6 = xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7]
; XOPAVX1-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3]
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm2, %ymm2
; XOPAVX1-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
; XOPAVX1-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
; XOPAVX1-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; XOPAVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm1
; XOPAVX1-NEXT:    vmovaps %ymm2, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: splitTransposeDecode_8_avx2:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
; XOPAVX2-NEXT:    vpmovsxbd {{.*#+}} ymm3 = [0,4,1,5,2,6,3,7]
; XOPAVX2-NEXT:    vpermd %ymm2, %ymm3, %ymm2
; XOPAVX2-NEXT:    vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15]
; XOPAVX2-NEXT:    vpermd %ymm0, %ymm3, %ymm1
; XOPAVX2-NEXT:    vmovdqa %ymm2, %ymm0
; XOPAVX2-NEXT:    retq
  %shuffle.i = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27>
  %shuffle.i59 = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
  %1 = bitcast <16 x i16> %shuffle.i to <8 x i32>
  %2 = shufflevector <8 x i32> %1, <8 x i32> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
  %3 = bitcast <16 x i16> %shuffle.i59 to <8 x i32>
  %4 = shufflevector <8 x i32> %3, <8 x i32> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
  %5 = insertvalue { <8 x i32>, <8 x i32> } poison, <8 x i32> %2, 0
  %6 = insertvalue { <8 x i32>, <8 x i32> } %5, <8 x i32> %4, 1
  ret { <8 x i32>, <8 x i32> } %6
}
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; ALL: {{.*}}
; AVX1OR2: {{.*}}
; AVX2-FAST: {{.*}}
; AVX2-FAST-ALL: {{.*}}
; AVX2-FAST-PERLANE: {{.*}}
; AVX2-SLOW: {{.*}}
; AVX2OR512VL: {{.*}}
; AVX512VL-FAST: {{.*}}
; AVX512VL-FAST-CROSSLANE: {{.*}}
; AVX512VL-FAST-PERLANE: {{.*}}
; AVX512VL-SLOW: {{.*}}
; XOP: {{.*}}