; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=CHECK,SSE,SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=CHECK,AVX,AVX512F
;
; Combine tests involving SSE3/SSSE3 target shuffles (MOVDDUP, MOVSHDUP, MOVSLDUP, PSHUFB)

declare <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8>, <16 x i8>)

define <16 x i8> @combine_vpshufb_as_zero(<16 x i8> %a0) {
; SSE-LABEL: combine_vpshufb_as_zero:
; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vpshufb_as_zero:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %res0 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 128, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>)
  %res1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %res0, <16 x i8> <i8 0, i8 128, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>)
  %res2 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %res1, <16 x i8> <i8 0, i8 1, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128>)
  ret <16 x i8> %res2
}

define <16 x i8> @combine_vpshufb_as_movq(<16 x i8> %a0) {
; SSE-LABEL: combine_vpshufb_as_movq:
; SSE: # %bb.0:
; SSE-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vpshufb_as_movq:
; AVX: # %bb.0:
; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX-NEXT: retq
  %res0 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 0, i8 128, i8 1, i8 128, i8 2, i8 128, i8 3, i8 128, i8 4, i8 128, i8 5, i8 128, i8 6, i8 128, i8 7, i8 128>)
  %res1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %res0, <16 x i8> <i8 0, i8 2, i8 4, i8 6, i8 8, i8 10, i8 12, i8 14, i8 1, i8 3, i8 5, i8 7, i8 9, i8 11, i8 13, i8 15>)
  ret <16 x i8> %res1
}

define <2 x double> @combine_pshufb_as_movsd(<2 x double> %a0, <2 x double> %a1) {
; SSSE3-LABEL: combine_pshufb_as_movsd:
; SSSE3: # %bb.0:
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_pshufb_as_movsd:
; SSE41: # %bb.0:
; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; SSE41-NEXT: retq
;
; AVX-LABEL: combine_pshufb_as_movsd:
; AVX: # %bb.0:
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX-NEXT: retq
  %1 = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 3, i32 0>
  %2 = bitcast <2 x double> %1 to <16 x i8>
  %3 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %2, <16 x i8> <i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>)
  %4 = bitcast <16 x i8> %3 to <2 x double>
  ret <2 x double> %4
}

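; A shuffle that takes its first element from the second operand, followed by a
; pshufb that restores the remaining lane order, should combine to a single
; movss/blendps.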
define <4 x float> @combine_pshufb_as_movss(<4 x float> %a0, <4 x float> %a1) {
; SSSE3-LABEL: combine_pshufb_as_movss:
; SSSE3: # %bb.0:
; SSSE3-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_pshufb_as_movss:
; SSE41: # %bb.0:
; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; SSE41-NEXT: retq
;
; AVX-LABEL: combine_pshufb_as_movss:
; AVX: # %bb.0:
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; AVX-NEXT: retq
  %1 = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 4, i32 3, i32 2, i32 1>
  %2 = bitcast <4 x float> %1 to <16 x i8>
  %3 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %2, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 12, i8 13, i8 14, i8 15, i8 8, i8 9, i8 10, i8 11, i8 4, i8 5, i8 6, i8 7>)
  %4 = bitcast <16 x i8> %3 to <4 x float>
  ret <4 x float> %4
}

define <4 x i32> @combine_pshufb_as_zext(<16 x i8> %a0) {
; SSSE3-LABEL: combine_pshufb_as_zext:
; SSSE3: # %bb.0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_pshufb_as_zext:
; SSE41: # %bb.0:
; SSE41-NEXT: pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; SSE41-NEXT: retq
;
; AVX-LABEL: combine_pshufb_as_zext:
; AVX: # %bb.0:
; AVX-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
; AVX-NEXT: retq
  %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 0, i8 -1, i8 -1, i8 -1, i8 1, i8 -1, i8 -1, i8 -1, i8 2, i8 -1, i8 -1, i8 -1, i8 3, i8 -1, i8 -1, i8 -1>)
  %2 = bitcast <16 x i8> %1 to <4 x i32>
  ret <4 x i32> %2
}

define <2 x double> @combine_pshufb_as_vzmovl_64(<2 x double> %a0) {
; SSE-LABEL: combine_pshufb_as_vzmovl_64:
; SSE: # %bb.0:
; SSE-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
; SSE-NEXT: retq
;
; AVX-LABEL: combine_pshufb_as_vzmovl_64:
; AVX: # %bb.0:
; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
; AVX-NEXT: retq
  %1 = bitcast <2 x double> %a0 to <16 x i8>
  %2 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
  %3 = bitcast <16 x i8> %2 to <2 x double>
  ret <2 x double> %3
}

define <4 x float> @combine_pshufb_as_vzmovl_32(<4 x float> %a0) {
; SSSE3-LABEL: combine_pshufb_as_vzmovl_32:
; SSSE3: # %bb.0:
; SSSE3-NEXT: xorps %xmm1, %xmm1
; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSSE3-NEXT: movaps %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_pshufb_as_vzmovl_32:
; SSE41: # %bb.0:
; SSE41-NEXT: xorps %xmm1, %xmm1
; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; SSE41-NEXT: retq
;
; AVX-LABEL: combine_pshufb_as_vzmovl_32:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; AVX-NEXT: retq
  %1 = bitcast <4 x float> %a0 to <16 x i8>
  %2 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
  %3 = bitcast <16 x i8> %2 to <4 x float>
  ret <4 x float> %3
}

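; The next three tests combine a pshufb with a MOVDDUP/MOVSHDUP/MOVSLDUP-style
; shufflevector into a single byte shuffle.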
define <4 x float> @combine_pshufb_movddup(<4 x float> %a0) {
; SSE-LABEL: combine_pshufb_movddup:
; SSE: # %bb.0:
; SSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[5,5,5,5,7,7,7,7,5,5,5,5,7,7,7,7]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_pshufb_movddup:
; AVX: # %bb.0:
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,5,5,5,7,7,7,7,5,5,5,5,7,7,7,7]
; AVX-NEXT: retq
  %1 = bitcast <4 x float> %a0 to <16 x i8>
  %2 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> <i8 5, i8 5, i8 5, i8 5, i8 7, i8 7, i8 7, i8 7, i8 1, i8 1, i8 1, i8 1, i8 3, i8 3, i8 3, i8 3>)
  %3 = bitcast <16 x i8> %2 to <4 x float>
  %4 = shufflevector <4 x float> %3, <4 x float> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  ret <4 x float> %4
}

define <4 x float> @combine_pshufb_movshdup(<4 x float> %a0) {
; SSE-LABEL: combine_pshufb_movshdup:
; SSE: # %bb.0:
; SSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[7,7,7,7,7,7,7,7,3,3,3,3,3,3,3,3]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_pshufb_movshdup:
; AVX: # %bb.0:
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[7,7,7,7,7,7,7,7,3,3,3,3,3,3,3,3]
; AVX-NEXT: retq
  %1 = bitcast <4 x float> %a0 to <16 x i8>
  %2 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> <i8 5, i8 5, i8 5, i8 5, i8 7, i8 7, i8 7, i8 7, i8 1, i8 1, i8 1, i8 1, i8 3, i8 3, i8 3, i8 3>)
  %3 = bitcast <16 x i8> %2 to <4 x float>
  %4 = shufflevector <4 x float> %3, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
  ret <4 x float> %4
}

define <4 x float> @combine_pshufb_movsldup(<4 x float> %a0) {
; SSE-LABEL: combine_pshufb_movsldup:
; SSE: # %bb.0:
; SSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[5,5,5,5,5,5,5,5,1,1,1,1,1,1,1,1]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_pshufb_movsldup:
; AVX: # %bb.0:
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[5,5,5,5,5,5,5,5,1,1,1,1,1,1,1,1]
; AVX-NEXT: retq
  %1 = bitcast <4 x float> %a0 to <16 x i8>
  %2 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> <i8 5, i8 5, i8 5, i8 5, i8 7, i8 7, i8 7, i8 7, i8 1, i8 1, i8 1, i8 1, i8 3, i8 3, i8 3, i8 3>)
  %3 = bitcast <16 x i8> %2 to <4 x float>
  %4 = shufflevector <4 x float> %3, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
  ret <4 x float> %4
}

define <16 x i8> @combine_pshufb_palignr(<16 x i8> %a0, <16 x i8> %a1) {
; SSE-LABEL: combine_pshufb_palignr:
; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_pshufb_palignr:
; AVX: # %bb.0:
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX-NEXT: retq
  %1 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
  %2 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>)
  ret <16 x i8> %2
}

define <16 x i8> @combine_pshufb_pslldq(<16 x i8> %a0) {
; SSE-LABEL: combine_pshufb_pslldq:
; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_pshufb_pslldq:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>)
  %2 = shufflevector <16 x i8> %1, <16 x i8> zeroinitializer, <16 x i32> <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <16 x i8> %2
}

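; As with the pslldq case above, the byte shift discards every lane the pshufb
; kept, so the whole chain should fold to zero.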
define <16 x i8> @combine_pshufb_psrldq(<16 x i8> %a0) {
; SSE-LABEL: combine_pshufb_psrldq:
; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_pshufb_psrldq:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
  %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128>)
  %2 = shufflevector <16 x i8> %1, <16 x i8> zeroinitializer, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  ret <16 x i8> %2
}

define <16 x i8> @combine_and_pshufb(<16 x i8> %a0) {
; SSSE3-LABEL: combine_and_pshufb:
; SSSE3: # %bb.0:
; SSSE3-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_and_pshufb:
; SSE41: # %bb.0:
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5,6,7]
; SSE41-NEXT: retq
;
; AVX-LABEL: combine_and_pshufb:
; AVX: # %bb.0:
; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5,6,7]
; AVX-NEXT: retq
  %1 = shufflevector <16 x i8> %a0, <16 x i8> zeroinitializer, <16 x i32> <i32 16, i32 16, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %2 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> <i8 0, i8 1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 8, i8 9, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
  ret <16 x i8> %2
}

define <16 x i8> @combine_pshufb_and(<16 x i8> %a0) {
; SSSE3-LABEL: combine_pshufb_and:
; SSSE3: # %bb.0:
; SSSE3-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_pshufb_and:
; SSE41: # %bb.0:
; SSE41-NEXT: pxor %xmm1, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5,6,7]
; SSE41-NEXT: retq
;
; AVX-LABEL: combine_pshufb_and:
; AVX: # %bb.0:
; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4],xmm1[5,6,7]
; AVX-NEXT: retq
  %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 0, i8 1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 8, i8 9, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
  %2 = shufflevector <16 x i8> %1, <16 x i8> zeroinitializer, <16 x i32> <i32 16, i32 16, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i8> %2
}

define <16 x i8> @combine_pshufb_as_palignr(<16 x i8> %a0) {
; SSE-LABEL: combine_pshufb_as_palignr:
; SSE: # %bb.0:
; SSE-NEXT: palignr {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_pshufb_as_palignr:
; AVX: # %bb.0:
; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,0]
; AVX-NEXT: retq
  %res0 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 undef, i8 undef, i8 0>)
  ret <16 x i8> %res0
}

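; pshufb masks that slide bytes in order and fill the remainder with the 0x80
; zeroing sentinel should lower to the matching PSLLDQ/PSRLDQ or per-element
; shift rather than a shuffle.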
define <16 x i8> @combine_pshufb_as_pslldq(<16 x i8> %a0) {
; SSE-LABEL: combine_pshufb_as_pslldq:
; SSE: # %bb.0:
; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_pshufb_as_pslldq:
; AVX: # %bb.0:
; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
; AVX-NEXT: retq
  %res0 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5>)
  ret <16 x i8> %res0
}

define <16 x i8> @combine_pshufb_as_psrldq(<16 x i8> %a0) {
; SSE-LABEL: combine_pshufb_as_psrldq:
; SSE: # %bb.0:
; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSE-NEXT: retq
;
; AVX-LABEL: combine_pshufb_as_psrldq:
; AVX: # %bb.0:
; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX-NEXT: retq
  %res0 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 15, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128, i8 128>)
  ret <16 x i8> %res0
}

define <16 x i8> @combine_pshufb_as_psrlw(<16 x i8> %a0) {
; SSE-LABEL: combine_pshufb_as_psrlw:
; SSE: # %bb.0:
; SSE-NEXT: psrlw $8, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_pshufb_as_psrlw:
; AVX: # %bb.0:
; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0
; AVX-NEXT: retq
  %res0 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 1, i8 128, i8 3, i8 128, i8 5, i8 128, i8 7, i8 128, i8 9, i8 128, i8 11, i8 128, i8 13, i8 128, i8 15, i8 128>)
  ret <16 x i8> %res0
}

define <16 x i8> @combine_pshufb_as_pslld(<16 x i8> %a0) {
; SSE-LABEL: combine_pshufb_as_pslld:
; SSE: # %bb.0:
; SSE-NEXT: pslld $24, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_pshufb_as_pslld:
; AVX: # %bb.0:
; AVX-NEXT: vpslld $24, %xmm0, %xmm0
; AVX-NEXT: retq
  %res0 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 128, i8 128, i8 128, i8 0, i8 128, i8 128, i8 128, i8 4, i8 128, i8 128, i8 128, i8 8, i8 128, i8 128, i8 128, i8 12>)
  ret <16 x i8> %res0
}

define <16 x i8> @combine_pshufb_as_psrlq(<16 x i8> %a0) {
; SSE-LABEL: combine_pshufb_as_psrlq:
; SSE: # %bb.0:
; SSE-NEXT: psrlq $40, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_pshufb_as_psrlq:
; AVX: # %bb.0:
; AVX-NEXT: vpsrlq $40, %xmm0, %xmm0
; AVX-NEXT: retq
  %res0 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 5, i8 6, i8 7, i8 128, i8 128, i8 128, i8 128, i8 128, i8 13, i8 14, i8 15, i8 128, i8 128, i8 128, i8 128, i8 128>)
  ret <16 x i8> %res0
}

define <16 x i8> @combine_pshufb_as_pshuflw(<16 x i8> %a0) {
; SSE-LABEL: combine_pshufb_as_pshuflw:
; SSE: # %bb.0:
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_pshufb_as_pshuflw:
; AVX: # %bb.0:
; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
; AVX-NEXT: retq
  %res0 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 2, i8 3, i8 0, i8 1, i8 6, i8 7, i8 4, i8 5, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>)
  ret <16 x i8> %res0
}

define <16 x i8> @combine_pshufb_as_pshufhw(<16 x i8> %a0) {
; SSE-LABEL: combine_pshufb_as_pshufhw:
; SSE: # %bb.0:
; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_pshufb_as_pshufhw:
; AVX: # %bb.0:
; AVX-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6]
; AVX-NEXT: retq
  %res0 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 10, i8 11, i8 8, i8 9, i8 14, i8 15, i8 12, i8 13>)
  ret <16 x i8> %res0
}

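; The two masks combine to swap the words within every dword, which would need
; both a pshuflw and a pshufhw; a single pshufb is kept instead (AVX512F can
; use a 32-bit rotate).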
define <16 x i8> @combine_pshufb_not_as_pshufw(<16 x i8> %a0) {
; SSE-LABEL: combine_pshufb_not_as_pshufw:
; SSE: # %bb.0:
; SSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
; SSE-NEXT: retq
;
; AVX1-LABEL: combine_pshufb_not_as_pshufw:
; AVX1: # %bb.0:
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_pshufb_not_as_pshufw:
; AVX2: # %bb.0:
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13]
; AVX2-NEXT: retq
;
; AVX512F-LABEL: combine_pshufb_not_as_pshufw:
; AVX512F: # %bb.0:
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512F-NEXT: vprold $16, %zmm0, %zmm0
; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
  %res0 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 2, i8 3, i8 0, i8 1, i8 6, i8 7, i8 4, i8 5, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>)
  %res1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %res0, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 10, i8 11, i8 8, i8 9, i8 14, i8 15, i8 12, i8 13>)
  ret <16 x i8> %res1
}

define <16 x i8> @combine_vpshufb_as_pshuflw_not_pslld(ptr%a0) {
; SSE-LABEL: combine_vpshufb_as_pshuflw_not_pslld:
; SSE: # %bb.0:
; SSE-NEXT: pshuflw {{.*#+}} xmm0 = mem[0,0,2,2,4,5,6,7]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_vpshufb_as_pshuflw_not_pslld:
; AVX: # %bb.0:
; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = mem[0,0,2,2,4,5,6,7]
; AVX-NEXT: retq
  %res0 = load <16 x i8>, ptr%a0, align 16
  %res1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %res0, <16 x i8> <i8 undef, i8 undef, i8 0, i8 1, i8 undef, i8 undef, i8 4, i8 5, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>)
  ret <16 x i8> %res1
}

define <16 x i8> @combine_pshufb_as_unary_unpcklbw(<16 x i8> %a0) {
; SSE-LABEL: combine_pshufb_as_unary_unpcklbw:
; SSE: # %bb.0:
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_pshufb_as_unary_unpcklbw:
; AVX: # %bb.0:
; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX-NEXT: retq
  %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 0, i8 undef, i8 undef, i8 1, i8 2, i8 2, i8 3, i8 3, i8 4, i8 4, i8 5, i8 5, i8 6, i8 6, i8 7, i8 7>)
  ret <16 x i8> %1
}

define <16 x i8> @combine_pshufb_as_unary_unpckhwd(<16 x i8> %a0) {
; SSE-LABEL: combine_pshufb_as_unary_unpckhwd:
; SSE: # %bb.0:
; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_pshufb_as_unary_unpckhwd:
; AVX: # %bb.0:
; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
; AVX-NEXT: retq
  %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 8, i8 9, i8 8, i8 9, i8 10, i8 11, i8 10, i8 11, i8 12, i8 13, i8 12, i8 13, i8 14, i8 15, i8 undef, i8 undef>)
  ret <16 x i8> %1
}

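; With the undef mask lanes free to take any value, both shuffle chains below
; can resolve to the identity, so no instructions should be emitted.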
define <8 x i16> @combine_pshufb_as_unpacklo_undef(<16 x i8> %a0) {
; CHECK-LABEL: combine_pshufb_as_unpacklo_undef:
; CHECK: # %bb.0:
; CHECK-NEXT: retq
  %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 undef, i8 undef, i8 0, i8 1, i8 undef, i8 undef, i8 2, i8 3, i8 undef, i8 undef, i8 4, i8 5, i8 undef, i8 undef, i8 6, i8 7>)
  %2 = bitcast <16 x i8> %1 to <8 x i16>
  %3 = shufflevector <8 x i16> %2, <8 x i16> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
  ret <8 x i16> %3
}

define <16 x i8> @combine_pshufb_as_unpackhi_undef(<16 x i8> %a0) {
; CHECK-LABEL: combine_pshufb_as_unpackhi_undef:
; CHECK: # %bb.0:
; CHECK-NEXT: retq
  %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 8, i8 undef, i8 9, i8 undef, i8 10, i8 undef, i8 11, i8 undef, i8 12, i8 undef, i8 13, i8 undef, i8 14, i8 undef, i8 15, i8 undef>)
  %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
  ret <16 x i8> %2
}

define <16 x i8> @combine_pshufb_as_unpacklo_zero(<16 x i8> %a0) {
; SSE-LABEL: combine_pshufb_as_unpacklo_zero:
; SSE: # %bb.0:
; SSE-NEXT: xorps %xmm1, %xmm1
; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT: movaps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_pshufb_as_unpacklo_zero:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX-NEXT: retq
  %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1, i8 4, i8 5, i8 6, i8 7>)
  ret <16 x i8> %1
}

define <16 x i8> @combine_pshufb_as_unpackhi_zero(<16 x i8> %a0) {
; SSE-LABEL: combine_pshufb_as_unpackhi_zero:
; SSE: # %bb.0:
; SSE-NEXT: pxor %xmm1, %xmm1
; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_pshufb_as_unpackhi_zero:
; AVX: # %bb.0:
; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; AVX-NEXT: retq
  %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 8, i8 -1, i8 9, i8 -1, i8 10, i8 -1, i8 11, i8 -1, i8 12, i8 -1, i8 13, i8 -1, i8 14, i8 -1, i8 15, i8 -1>)
  ret <16 x i8> %1
}

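; A logical shift by a whole number of bytes is itself a shuffle with zeros, so
; the shift+pshufb chains below should merge into a single pshufb.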
define <16 x i8> @combine_psrlw_pshufb(<8 x i16> %a0) {
; SSE-LABEL: combine_psrlw_pshufb:
; SSE: # %bb.0:
; SSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[1],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[1],zero,zero,zero
; SSE-NEXT: retq
;
; AVX-LABEL: combine_psrlw_pshufb:
; AVX: # %bb.0:
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[1],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[1],zero,zero,zero
; AVX-NEXT: retq
  %1 = lshr <8 x i16> %a0, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %2 = bitcast <8 x i16> %1 to <16 x i8>
  %3 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %2, <16 x i8> <i8 0, i8 -1, i8 -1, i8 -1, i8 0, i8 -1, i8 -1, i8 -1, i8 0, i8 -1, i8 -1, i8 -1, i8 0, i8 -1, i8 -1, i8 -1>)
  ret <16 x i8> %3
}

define <16 x i8> @combine_pslld_pshufb(<4 x i32> %a0) {
; SSE-LABEL: combine_pslld_pshufb:
; SSE: # %bb.0:
; SSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,1,0],zero,xmm0[6,5,4],zero,xmm0[10,9,8],zero,xmm0[14,13,12],zero
; SSE-NEXT: retq
;
; AVX-LABEL: combine_pslld_pshufb:
; AVX: # %bb.0:
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,1,0],zero,xmm0[6,5,4],zero,xmm0[10,9,8],zero,xmm0[14,13,12],zero
; AVX-NEXT: retq
  %1 = shl <4 x i32> %a0, <i32 8, i32 8, i32 8, i32 8>
  %2 = bitcast <4 x i32> %1 to <16 x i8>
  %3 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %2, <16 x i8> <i8 3, i8 2, i8 1, i8 0, i8 7, i8 6, i8 5, i8 4, i8 11, i8 10, i8 9, i8 8, i8 15, i8 14, i8 13, i8 12>)
  ret <16 x i8> %3
}

define <16 x i8> @combine_psrlq_pshufb(<2 x i64> %a0) {
; SSE-LABEL: combine_psrlq_pshufb:
; SSE: # %bb.0:
; SSE-NEXT: pshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[7,6],zero,zero,zero,zero,zero,zero,xmm0[15,14]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_psrlq_pshufb:
; AVX: # %bb.0:
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[7,6],zero,zero,zero,zero,zero,zero,xmm0[15,14]
; AVX-NEXT: retq
  %1 = lshr <2 x i64> %a0, <i64 48, i64 48>
  %2 = bitcast <2 x i64> %1 to <16 x i8>
  %3 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %2, <16 x i8> <i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8>)
  ret <16 x i8> %3
}

define <16 x i8> @combine_unpckl_arg0_pshufb(<16 x i8> %a0, <16 x i8> %a1) {
; SSE-LABEL: combine_unpckl_arg0_pshufb:
; SSE: # %bb.0:
; SSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero
; SSE-NEXT: retq
;
; AVX-LABEL: combine_unpckl_arg0_pshufb:
; AVX: # %bb.0:
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero
; AVX-NEXT: retq
  %1 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
  %2 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> <i8 0, i8 -1, i8 -1, i8 -1, i8 0, i8 -1, i8 -1, i8 -1, i8 0, i8 -1, i8 -1, i8 -1, i8 0, i8 -1, i8 -1, i8 -1>)
  ret <16 x i8> %2
}

define <16 x i8> @combine_unpckl_arg1_pshufb(<16 x i8> %a0, <16 x i8> %a1) {
; SSE-LABEL: combine_unpckl_arg1_pshufb:
; SSE: # %bb.0:
; SSE-NEXT: movdqa %xmm1, %xmm0
; SSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero
; SSE-NEXT: retq
;
; AVX-LABEL: combine_unpckl_arg1_pshufb:
; AVX: # %bb.0:
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero
; AVX-NEXT: retq
  %1 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
  %2 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> <i8 1, i8 -1, i8 -1, i8 -1, i8 1, i8 -1, i8 -1, i8 -1, i8 1, i8 -1, i8 -1, i8 -1, i8 1, i8 -1, i8 -1, i8 -1>)
  ret <16 x i8> %2
}

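; A chain of insertelements and word unpacks should still resolve to a single
; word-level pshufb mask.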
define <8 x i16> @shuffle_combine_unpack_insert(<8 x i16> %a0) {
; SSE-LABEL: shuffle_combine_unpack_insert:
; SSE: # %bb.0:
; SSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[4,5,4,5,4,5,8,9,8,9,8,9,10,11,10,11]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_combine_unpack_insert:
; AVX: # %bb.0:
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,4,5,4,5,8,9,8,9,8,9,10,11,10,11]
; AVX-NEXT: retq
  %1 = extractelement <8 x i16> %a0, i32 2
  %2 = extractelement <8 x i16> %a0, i32 4
  %3 = insertelement <8 x i16> %a0, i16 %1, i32 4
  %4 = insertelement <8 x i16> %a0, i16 %2, i32 2
  %5 = shufflevector <8 x i16> %3, <8 x i16> %4, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
  %6 = shufflevector <8 x i16> %5, <8 x i16> %3, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 undef, i32 undef, i32 undef, i32 undef>
  %7 = shufflevector <8 x i16> %5, <8 x i16> %a0, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 undef, i32 undef, i32 undef, i32 undef>
  %8 = shufflevector <8 x i16> %6, <8 x i16> %7, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
  ret <8 x i16> %8
}

define <16 x i8> @shuffle_combine_packssdw_pshufb(<4 x i32> %a0) {
; SSE-LABEL: shuffle_combine_packssdw_pshufb:
; SSE: # %bb.0:
; SSE-NEXT: psrad $31, %xmm0
; SSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[13,12,9,8,5,4,1,0,13,12,9,8,5,4,1,0]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_combine_packssdw_pshufb:
; AVX: # %bb.0:
; AVX-NEXT: vpsrad $31, %xmm0, %xmm0
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[13,12,9,8,5,4,1,0,13,12,9,8,5,4,1,0]
; AVX-NEXT: retq
  %1 = ashr <4 x i32> %a0, <i32 31, i32 31, i32 31, i32 31>
  %2 = tail call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %1, <4 x i32> %1)
  %3 = bitcast <8 x i16> %2 to <16 x i8>
  %4 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %3, <16 x i8> <i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8>)
  ret <16 x i8> %4
}
declare <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32>, <4 x i32>) nounwind readnone

define <16 x i8> @shuffle_combine_packsswb_pshufb(<8 x i16> %a0, <8 x i16> %a1) {
; SSE-LABEL: shuffle_combine_packsswb_pshufb:
; SSE: # %bb.0:
; SSE-NEXT: psraw $15, %xmm0
; SSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[14,12,10,8,6,4,2,0,14,12,10,8,6,4,2,0]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_combine_packsswb_pshufb:
; AVX: # %bb.0:
; AVX-NEXT: vpsraw $15, %xmm0, %xmm0
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[14,12,10,8,6,4,2,0,14,12,10,8,6,4,2,0]
; AVX-NEXT: retq
  %1 = ashr <8 x i16> %a0, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
  %2 = ashr <8 x i16> %a1, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
  %3 = tail call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %1, <8 x i16> %2)
  %4 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %3, <16 x i8> <i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
  ret <16 x i8> %4
}
declare <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16>, <8 x i16>) nounwind readnone

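; With both inputs pre-shifted, the pack and shuffle should reduce to one
; pshufb that selects the odd bytes of the first source directly.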
define <16 x i8> @shuffle_combine_packuswb_pshufb(<8 x i16> %a0, <8 x i16> %a1) {
; SSE-LABEL: shuffle_combine_packuswb_pshufb:
; SSE: # %bb.0:
; SSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[15,13,11,9,7,5,3,1,15,13,11,9,7,5,3,1]
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_combine_packuswb_pshufb:
; AVX: # %bb.0:
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[15,13,11,9,7,5,3,1,15,13,11,9,7,5,3,1]
; AVX-NEXT: retq
  %1 = lshr <8 x i16> %a0, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %2 = lshr <8 x i16> %a1, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %3 = tail call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %1, <8 x i16> %2)
  %4 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %3, <16 x i8> <i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>)
  ret <16 x i8> %4
}
declare <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16>, <8 x i16>) nounwind readnone

define <16 x i8> @combine_pshufb_pshufb_or_as_blend(<16 x i8> %a0, <16 x i8> %a1) {
; SSSE3-LABEL: combine_pshufb_pshufb_or_as_blend:
; SSSE3: # %bb.0:
; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; SSSE3-NEXT: retq
;
; SSE41-LABEL: combine_pshufb_pshufb_or_as_blend:
; SSE41: # %bb.0:
; SSE41-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; SSE41-NEXT: retq
;
; AVX-LABEL: combine_pshufb_pshufb_or_as_blend:
; AVX: # %bb.0:
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX-NEXT: retq
  %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
  %2 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a1, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15>)
  %3 = or <16 x i8> %1, %2
  ret <16 x i8> %3
}

define <16 x i8> @combine_pshufb_pshufb_or_as_unpcklbw(<16 x i8> %a0, <16 x i8> %a1) {
; SSE-LABEL: combine_pshufb_pshufb_or_as_unpcklbw:
; SSE: # %bb.0:
; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE-NEXT: retq
;
; AVX-LABEL: combine_pshufb_pshufb_or_as_unpcklbw:
; AVX: # %bb.0:
; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX-NEXT: retq
  %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 0, i8 -1, i8 1, i8 -1, i8 2, i8 -1, i8 3, i8 -1, i8 4, i8 -1, i8 5, i8 -1, i8 6, i8 -1, i8 7, i8 -1>)
  %2 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a1, <16 x i8> <i8 -1, i8 0, i8 -1, i8 1, i8 -1, i8 2, i8 -1, i8 3, i8 -1, i8 4, i8 -1, i8 5, i8 -1, i8 6, i8 -1, i8 7>)
  %3 = or <16 x i8> %1, %2
  ret <16 x i8> %3
}

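; ORing two pshufbs with complementary zeroing masks acts as a single shuffle
; of %a0, and the trailing pshufb then reduces the whole chain to a splat of
; the low dword.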
define <16 x i8> @combine_pshufb_pshufb_or_pshufb(<16 x i8> %a0) {
; SSE-LABEL: combine_pshufb_pshufb_or_pshufb:
; SSE: # %bb.0:
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE-NEXT: retq
;
; AVX1-LABEL: combine_pshufb_pshufb_or_pshufb:
; AVX1: # %bb.0:
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: retq
;
; AVX2-LABEL: combine_pshufb_pshufb_or_pshufb:
; AVX2: # %bb.0:
; AVX2-NEXT: vbroadcastss %xmm0, %xmm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: combine_pshufb_pshufb_or_pshufb:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vbroadcastss %xmm0, %xmm0
; AVX512F-NEXT: retq
  %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1>)
  %2 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 2, i8 3>)
  %3 = or <16 x i8> %1, %2
  %4 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %3, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>)
  ret <16 x i8> %4
}

define <16 x i8> @combine_and_pshufb_or_pshufb(<16 x i8> %a0, <16 x i8> %a1) {
; SSE-LABEL: combine_and_pshufb_or_pshufb:
; SSE: # %bb.0:
; SSE-NEXT: pshufb {{.*#+}} xmm0 = zero,zero,zero,xmm0[15],zero,xmm0[1],zero,xmm0[14],zero,xmm0[2],zero,xmm0[13],zero,xmm0[3],zero,zero
; SSE-NEXT: pshufb {{.*#+}} xmm1 = xmm1[7],zero,xmm1[0],zero,xmm1[8],zero,xmm1[1],zero,xmm1[9],zero,xmm1[10],zero,xmm1[7],zero,xmm1[7],zero
; SSE-NEXT: por %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: combine_and_pshufb_or_pshufb:
; AVX: # %bb.0:
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm0[15],zero,xmm0[1],zero,xmm0[14],zero,xmm0[2],zero,xmm0[13],zero,xmm0[3],zero,zero
; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[7],zero,xmm1[0],zero,xmm1[8],zero,xmm1[1],zero,xmm1[9],zero,xmm1[10],zero,xmm1[7],zero,xmm1[7],zero
; AVX-NEXT: vpor %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
  %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 15, i8 -1, i8 1, i8 -1, i8 14, i8 -1, i8 2, i8 -1, i8 13, i8 -1, i8 3, i8 -1, i8 -1>)
  %2 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a1, <16 x i8> <i8 7, i8 -1, i8 0, i8 -1, i8 8, i8 -1, i8 1, i8 -1, i8 9, i8 -1, i8 10, i8 -1, i8 7, i8 -1, i8 7, i8 -1>)
  %3 = or <16 x i8> %1, %2
  %4 = and <16 x i8> %3, <i8 -1, i8 0, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
  ret <16 x i8> %4
}

define <16 x i8> @constant_fold_pshufb() {
; SSE-LABEL: constant_fold_pshufb:
; SSE: # %bb.0:
; SSE-NEXT: movaps {{.*#+}} xmm0 = [14,0,0,0,u,u,0,0,0,0,0,0,0,0,8,9]
; SSE-NEXT: retq
;
; AVX-LABEL: constant_fold_pshufb:
; AVX: # %bb.0:
; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [14,0,0,0,u,u,0,0,0,0,0,0,0,0,8,9]
; AVX-NEXT: retq
  %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> <i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>, <16 x i8> <i8 1, i8 -1, i8 -1, i8 -1, i8 undef, i8 undef, i8 -1, i8 -1, i8 15, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 7, i8 6>)
  ret <16 x i8> %1
}

define <16 x i8> @constant_fold_pshufb_2() {
; SSE-LABEL: constant_fold_pshufb_2:
; SSE: # %bb.0:
; SSE-NEXT: movl $2, %eax
; SSE-NEXT: movd %eax, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: constant_fold_pshufb_2:
; AVX: # %bb.0:
; AVX-NEXT: movl $2, %eax
; AVX-NEXT: vmovd %eax, %xmm0
; AVX-NEXT: retq
  %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> <i8 2, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <16 x i8> <i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>)
  ret <16 x i8> %1
}

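; The extracts below only demand part of the shuffled result, so the pshufb
; mask should shrink to the demanded bytes or become a plain shift.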
define i32 @mask_zzz3_v16i8(<16 x i8> %a0) {
; SSSE3-LABEL: mask_zzz3_v16i8:
; SSSE3: # %bb.0:
; SSSE3-NEXT: psrldq {{.*#+}} xmm0 = xmm0[11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSSE3-NEXT: movd %xmm0, %eax
; SSSE3-NEXT: andl $-16777216, %eax # imm = 0xFF000000
; SSSE3-NEXT: retq
;
; SSE41-LABEL: mask_zzz3_v16i8:
; SSE41: # %bb.0:
; SSE41-NEXT: psllw $8, %xmm0
; SSE41-NEXT: pextrd $3, %xmm0, %eax
; SSE41-NEXT: andl $-16777216, %eax # imm = 0xFF000000
; SSE41-NEXT: retq
;
; AVX-LABEL: mask_zzz3_v16i8:
; AVX: # %bb.0:
; AVX-NEXT: vpsllw $8, %xmm0, %xmm0
; AVX-NEXT: vpextrd $3, %xmm0, %eax
; AVX-NEXT: andl $-16777216, %eax # imm = 0xFF000000
; AVX-NEXT: retq
  %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 0, i8 2, i8 4, i8 6, i8 8, i8 10, i8 12, i8 14, i8 0, i8 2, i8 4, i8 6, i8 8, i8 10, i8 12, i8 14>)
  %2 = bitcast <16 x i8> %1 to <4 x i32>
  %3 = extractelement <4 x i32> %2, i32 3
  %4 = and i32 %3, 4278190080
  ret i32 %4
}

define i32 @mask_z1z3_v16i8(<16 x i8> %a0) {
; SSSE3-LABEL: mask_z1z3_v16i8:
; SSSE3: # %bb.0:
; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = zero,xmm0[10],zero,xmm0[14,u,u,u,u,u,u,u,u,u,u,u,u]
; SSSE3-NEXT: movd %xmm0, %eax
; SSSE3-NEXT: retq
;
; SSE41-LABEL: mask_z1z3_v16i8:
; SSE41: # %bb.0:
; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u,u,u],zero,xmm0[10],zero,xmm0[14]
; SSE41-NEXT: pextrd $3, %xmm0, %eax
; SSE41-NEXT: retq
;
; AVX-LABEL: mask_z1z3_v16i8:
; AVX: # %bb.0:
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u,u,u,u,u,u,u],zero,xmm0[10],zero,xmm0[14]
; AVX-NEXT: vpextrd $3, %xmm0, %eax
; AVX-NEXT: retq
  %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 0, i8 2, i8 4, i8 6, i8 8, i8 10, i8 12, i8 14, i8 0, i8 2, i8 4, i8 6, i8 8, i8 10, i8 12, i8 14>)
  %2 = bitcast <16 x i8> %1 to <4 x i32>
  %3 = extractelement <4 x i32> %2, i32 3
  %4 = and i32 %3, 4278255360
  ret i32 %4
}

define i32 @PR22415(double %a0) {
; SSE-LABEL: PR22415:
; SSE: # %bb.0:
; SSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,2,4],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
; SSE-NEXT: movd %xmm0, %eax
; SSE-NEXT: retq
;
; AVX-LABEL: PR22415:
; AVX: # %bb.0:
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: retq
  %1 = bitcast double %a0 to <8 x i8>
  %2 = shufflevector <8 x i8> %1, <8 x i8> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 undef>
  %3 = shufflevector <4 x i8> %2, <4 x i8> undef, <3 x i32> <i32 0, i32 1, i32 2>
  %4 = bitcast <3 x i8> %3 to i24
  %5 = zext i24 %4 to i32
  ret i32 %5
}