; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=ALL,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=ALL,AVX2OR512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=ALL,AVX2OR512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=ALL,AVX2OR512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=ALL,AVX2OR512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=ALL,AVX2OR512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512dq,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=ALL,AVX2OR512VL

; Every function except the last one slices a single 256-bit argument into its
; low and high 128-bit halves and then interleaves elements of the two halves,
; using either an unpckh-style mask (upper elements of each half) or an
; unpckl-style mask (lower elements of each half).

; 64-bit elements, unpckh: the AVX1 prefix expects vextractf128 + vunpckhpd,
; while the AVX2OR512VL prefix expects a single vpermpd.
define <2 x i64> @unpckh_unary_extracted_v4i64(<4 x i64> %x) {
; AVX1-LABEL: unpckh_unary_extracted_v4i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2OR512VL-LABEL: unpckh_unary_extracted_v4i64:
; AVX2OR512VL:       # %bb.0:
; AVX2OR512VL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[1,3,2,3]
; AVX2OR512VL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2OR512VL-NEXT:    vzeroupper
; AVX2OR512VL-NEXT:    retq
  %lo = shufflevector <4 x i64> %x, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
  %hi = shufflevector <4 x i64> %x, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
  %unpck = shufflevector <2 x i64> %lo, <2 x i64> %hi, <2 x i32> <i32 1, i32 3>
  ret <2 x i64> %unpck
}

; Floating-point twin of the test above.
; NOTE(review): the v8f64 suffix does not match the <4 x double> operand; the
; name is left untouched because the label checks are keyed on it.
define <2 x double> @unpckh_unary_extracted_v8f64(<4 x double> %x) {
; AVX1-LABEL: unpckh_unary_extracted_v8f64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2OR512VL-LABEL: unpckh_unary_extracted_v8f64:
; AVX2OR512VL:       # %bb.0:
; AVX2OR512VL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[1,3,2,3]
; AVX2OR512VL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2OR512VL-NEXT:    vzeroupper
; AVX2OR512VL-NEXT:    retq
  %lo = shufflevector <4 x double> %x, <4 x double> undef, <2 x i32> <i32 0, i32 1>
  %hi = shufflevector <4 x double> %x, <4 x double> undef, <2 x i32> <i32 2, i32 3>
  %unpck = shufflevector <2 x double> %lo, <2 x double> %hi, <2 x i32> <i32 1, i32 3>
  ret <2 x double> %unpck
}

; 32-bit elements, unpckh: a vpermps lowering would need its index operand
; loaded from a constant, so it is unlikely to pay off. Every configuration is
; expected to keep the extract + unpack sequence.

define <4 x i32> @unpckh_unary_extracted_v8i32(<8 x i32> %x) {
; ALL-LABEL: unpckh_unary_extracted_v8i32:
; ALL:       # %bb.0:
; ALL-NEXT:    vextractf128 $1, %ymm0, %xmm1
; ALL-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; ALL-NEXT:    vzeroupper
; ALL-NEXT:    retq
  %lo = shufflevector <8 x i32> %x, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %hi = shufflevector <8 x i32> %x, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %unpck = shufflevector <4 x i32> %lo, <4 x i32> %hi, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
  ret <4 x i32> %unpck
}

define <4 x float> @unpckh_unary_extracted_v8f32(<8 x float> %x) {
; ALL-LABEL: unpckh_unary_extracted_v8f32:
; ALL:       # %bb.0:
; ALL-NEXT:    vextractf128 $1, %ymm0, %xmm1
; ALL-NEXT:    vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; ALL-NEXT:    vzeroupper
; ALL-NEXT:    retq
  %lo = shufflevector <8 x float> %x, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %hi = shufflevector <8 x float> %x, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %unpck = shufflevector <4 x float> %lo, <4 x float> %hi, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
  ret <4 x float> %unpck
}

; 16-bit elements, unpckh: both prefixes expect extract + vpunpckhwd; only the
; extract mnemonic differs (vextractf128 vs. vextracti128).
define <8 x i16> @unpckh_unary_extracted_v16i16(<16 x i16> %x) {
; AVX1-LABEL: unpckh_unary_extracted_v16i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2OR512VL-LABEL: unpckh_unary_extracted_v16i16:
; AVX2OR512VL:       # %bb.0:
; AVX2OR512VL-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2OR512VL-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX2OR512VL-NEXT:    vzeroupper
; AVX2OR512VL-NEXT:    retq
  %lo = shufflevector <16 x i16> %x, <16 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %hi = shufflevector <16 x i16> %x, <16 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %unpck = shufflevector <8 x i16> %lo, <8 x i16> %hi, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
  ret <8 x i16> %unpck
}

; 8-bit elements, unpckh: same shape as the 16-bit case, using vpunpckhbw.
define <16 x i8> @unpckh_unary_extracted_v32i8(<32 x i8> %x) {
; AVX1-LABEL: unpckh_unary_extracted_v32i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2OR512VL-LABEL: unpckh_unary_extracted_v32i8:
; AVX2OR512VL:       # %bb.0:
; AVX2OR512VL-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2OR512VL-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; AVX2OR512VL-NEXT:    vzeroupper
; AVX2OR512VL-NEXT:    retq
  %lo = shufflevector <32 x i8> %x, <32 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %hi = shufflevector <32 x i8> %x, <32 x i8> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %unpck = shufflevector <16 x i8> %lo, <16 x i8> %hi, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
  ret <16 x i8> %unpck
}

; 64-bit elements, unpckl: the AVX1 prefix expects vextractf128 + vmovlhps,
; while the AVX2OR512VL prefix expects a single vpermpd.
define <2 x i64> @unpckl_unary_extracted_v4i64(<4 x i64> %x) {
; AVX1-LABEL: unpckl_unary_extracted_v4i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2OR512VL-LABEL: unpckl_unary_extracted_v4i64:
; AVX2OR512VL:       # %bb.0:
; AVX2OR512VL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2OR512VL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2OR512VL-NEXT:    vzeroupper
; AVX2OR512VL-NEXT:    retq
  %lo = shufflevector <4 x i64> %x, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
  %hi = shufflevector <4 x i64> %x, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
  %unpck = shufflevector <2 x i64> %lo, <2 x i64> %hi, <2 x i32> <i32 0, i32 2>
  ret <2 x i64> %unpck
}

; Floating-point twin of the test above.
; NOTE(review): the v8f64 suffix does not match the <4 x double> operand; the
; name is left untouched because the label checks are keyed on it.
define <2 x double> @unpckl_unary_extracted_v8f64(<4 x double> %x) {
; AVX1-LABEL: unpckl_unary_extracted_v8f64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2OR512VL-LABEL: unpckl_unary_extracted_v8f64:
; AVX2OR512VL:       # %bb.0:
; AVX2OR512VL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2OR512VL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2OR512VL-NEXT:    vzeroupper
; AVX2OR512VL-NEXT:    retq
  %lo = shufflevector <4 x double> %x, <4 x double> undef, <2 x i32> <i32 0, i32 1>
  %hi = shufflevector <4 x double> %x, <4 x double> undef, <2 x i32> <i32 2, i32 3>
  %unpck = shufflevector <2 x double> %lo, <2 x double> %hi, <2 x i32> <i32 0, i32 2>
  ret <2 x double> %unpck
}

; 32-bit elements, unpckl: a vpermps lowering would need its index operand
; loaded from a constant, so it is unlikely to pay off. Every configuration is
; expected to keep the extract + unpack sequence.

define <4 x i32> @unpckl_unary_extracted_v8i32(<8 x i32> %x) {
; ALL-LABEL: unpckl_unary_extracted_v8i32:
; ALL:       # %bb.0:
; ALL-NEXT:    vextractf128 $1, %ymm0, %xmm1
; ALL-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; ALL-NEXT:    vzeroupper
; ALL-NEXT:    retq
  %lo = shufflevector <8 x i32> %x, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %hi = shufflevector <8 x i32> %x, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %unpck = shufflevector <4 x i32> %lo, <4 x i32> %hi, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  ret <4 x i32> %unpck
}

define <4 x float> @unpckl_unary_extracted_v8f32(<8 x float> %x) {
; ALL-LABEL: unpckl_unary_extracted_v8f32:
; ALL:       # %bb.0:
; ALL-NEXT:    vextractf128 $1, %ymm0, %xmm1
; ALL-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; ALL-NEXT:    vzeroupper
; ALL-NEXT:    retq
  %lo = shufflevector <8 x float> %x, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %hi = shufflevector <8 x float> %x, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %unpck = shufflevector <4 x float> %lo, <4 x float> %hi, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  ret <4 x float> %unpck
}

; 16-bit elements, unpckl: both prefixes expect extract + vpunpcklwd; only the
; extract mnemonic differs (vextractf128 vs. vextracti128).
define <8 x i16> @unpckl_unary_extracted_v16i16(<16 x i16> %x) {
; AVX1-LABEL: unpckl_unary_extracted_v16i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2OR512VL-LABEL: unpckl_unary_extracted_v16i16:
; AVX2OR512VL:       # %bb.0:
; AVX2OR512VL-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2OR512VL-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2OR512VL-NEXT:    vzeroupper
; AVX2OR512VL-NEXT:    retq
  %lo = shufflevector <16 x i16> %x, <16 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %hi = shufflevector <16 x i16> %x, <16 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %unpck = shufflevector <8 x i16> %lo, <8 x i16> %hi, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
  ret <8 x i16> %unpck
}

; 8-bit elements, unpckl: same shape as the 16-bit case, using vpunpcklbw.
define <16 x i8> @unpckl_unary_extracted_v32i8(<32 x i8> %x) {
; AVX1-LABEL: unpckl_unary_extracted_v32i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2OR512VL-LABEL: unpckl_unary_extracted_v32i8:
; AVX2OR512VL:       # %bb.0:
; AVX2OR512VL-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2OR512VL-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX2OR512VL-NEXT:    vzeroupper
; AVX2OR512VL-NEXT:    retq
  %lo = shufflevector <32 x i8> %x, <32 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %hi = shufflevector <32 x i8> %x, <32 x i8> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %unpck = shufflevector <16 x i8> %lo, <16 x i8> %hi, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
  ret <16 x i8> %unpck
}

; Regression test: this shuffle used to send the compiler into an infinite loop
; because the unpack shuffle mask was not recognized when written in commuted
; form (high-half elements first, low-half elements second).

define <8 x i32> @extract_unpckl_v8i32(<8 x i32> %a) {
; ALL-LABEL: extract_unpckl_v8i32:
; ALL:       # %bb.0:
; ALL-NEXT:    vextractf128 $1, %ymm0, %xmm1
; ALL-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; ALL-NEXT:    retq
  %res = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> <i32 4, i32 undef, i32 5, i32 1, i32 undef, i32 undef, i32 undef, i32 undef>
  ret <8 x i32> %res
}