; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX512

; Combine tests involving SSE41 target shuffles (BLEND,INSERTPS,MOVZX)

declare <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8>, <16 x i8>)

define <16 x i8> @combine_vpshufb_as_movzx(<16 x i8> %a0) {
; SSE-LABEL: combine_vpshufb_as_movzx:
; SSE:       # %bb.0:
; SSE-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vpshufb_as_movzx:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX-NEXT:    retq
  %res0 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1, i8 undef, i8 undef, i8 undef, i8 undef, i8 -1, i8 -1, i8 -1, i8 -1>)
  ret <16 x i8> %res0
}

define <4 x i32> @combine_blend_of_permutes_v4i32(<2 x i64> %a0, <2 x i64> %a1) {
; SSE-LABEL: combine_blend_of_permutes_v4i32:
; SSE:       # %bb.0:
; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; SSE-NEXT:    retq
;
; AVX1-LABEL: combine_blend_of_permutes_v4i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: combine_blend_of_permutes_v4i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX2-NEXT:    retq
;
; AVX512-LABEL: combine_blend_of_permutes_v4i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [2,19,0,17]
; AVX512-NEXT:    vpermt2d %zmm1, %zmm2, %zmm0
; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %s0 = shufflevector <2 x i64> %a0, <2 x i64> undef, <2 x i32> <i32 1, i32 0>
  %s1 = shufflevector <2 x i64> %a1, <2 x i64> undef, <2 x i32> <i32 1, i32 0>
  %x0 = bitcast <2 x i64> %s0 to <4 x i32>
  %x1 = bitcast <2 x i64> %s1 to <4 x i32>
  %r = shufflevector <4 x i32> %x0, <4 x i32> %x1, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  ret <4 x i32> %r
}

define <16 x i8> @PR50049(ptr %p1, ptr %p2) {
; SSE-LABEL: PR50049:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa (%rdi), %xmm2
; SSE-NEXT:    movdqa 16(%rdi), %xmm3
; SSE-NEXT:    movdqa 32(%rdi), %xmm0
; SSE-NEXT:    movdqa (%rsi), %xmm4
; SSE-NEXT:    movdqa 16(%rsi), %xmm5
; SSE-NEXT:    movdqa 32(%rsi), %xmm1
; SSE-NEXT:    movdqa {{.*#+}} xmm6 = [128,128,128,128,128,128,2,5,8,11,14,u,u,u,u,u]
; SSE-NEXT:    pshufb %xmm6, %xmm3
; SSE-NEXT:    movdqa {{.*#+}} xmm7 = [0,3,6,9,12,15,128,128,128,128,128,u,u,u,u,u]
; SSE-NEXT:    pshufb %xmm7, %xmm2
; SSE-NEXT:    por %xmm3, %xmm2
; SSE-NEXT:    movdqa {{.*#+}} xmm3 = [0,1,2,3,4,5,6,7,8,9,10,128,128,128,128,128]
; SSE-NEXT:    pshufb %xmm3, %xmm2
; SSE-NEXT:    movdqa {{.*#+}} xmm8 = [128,128,128,128,128,128,128,128,128,128,128,1,4,7,10,13]
; SSE-NEXT:    pshufb %xmm8, %xmm0
; SSE-NEXT:    por %xmm2, %xmm0
; SSE-NEXT:    pshufb %xmm6, %xmm5
; SSE-NEXT:    pshufb %xmm7, %xmm4
; SSE-NEXT:    por %xmm5, %xmm4
; SSE-NEXT:    pshufb %xmm3, %xmm4
; SSE-NEXT:    pshufb %xmm8, %xmm1
; SSE-NEXT:    por %xmm4, %xmm1
; SSE-NEXT:    pmovzxbw {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; SSE-NEXT:    movdqa %xmm1, %xmm3
; SSE-NEXT:    pand %xmm2, %xmm3
; SSE-NEXT:    movdqa %xmm0, %xmm4
; SSE-NEXT:    pmaddubsw %xmm3, %xmm4
; SSE-NEXT:    pand %xmm2, %xmm4
; SSE-NEXT:    pandn %xmm1, %xmm2
; SSE-NEXT:    pmaddubsw %xmm2, %xmm0
; SSE-NEXT:    psllw $8, %xmm0
; SSE-NEXT:    por %xmm4, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: PR50049:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX1-NEXT:    vmovdqa 32(%rdi), %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,4,7,10,13,128,128,128,128,128,128,u,u,u,u,u]
; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [128,128,128,128,128,0,3,6,9,12,15,u,u,u,u,u]
; AVX1-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vmovdqa (%rsi), %xmm2
; AVX1-NEXT:    vmovdqa 16(%rsi), %xmm5
; AVX1-NEXT:    vmovdqa 32(%rsi), %xmm6
; AVX1-NEXT:    vpshufb %xmm3, %xmm6, %xmm3
; AVX1-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpor %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [5,6,7,8,9,10,128,128,128,128,128,0,1,2,3,4]
; AVX1-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [128,128,128,128,128,128,2,5,8,11,14,128,128,128,128,128]
; AVX1-NEXT:    vpshufb %xmm4, %xmm1, %xmm1
; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufb %xmm3, %xmm2, %xmm1
; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm2
; AVX1-NEXT:    vpor %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm3
; AVX1-NEXT:    vpmaddubsw %xmm3, %xmm0, %xmm3
; AVX1-NEXT:    vpand %xmm2, %xmm3, %xmm3
; AVX1-NEXT:    vpandn %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpmaddubsw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsllw $8, %xmm0, %xmm0
; AVX1-NEXT:    vpor %xmm0, %xmm3, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: PR50049:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX2-NEXT:    vmovdqa 32(%rdi), %xmm2
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,4,7,10,13,128,128,128,128,128,128,u,u,u,u,u]
; AVX2-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = [128,128,128,128,128,0,3,6,9,12,15,u,u,u,u,u]
; AVX2-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
; AVX2-NEXT:    vpor %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vmovdqa (%rsi), %xmm2
; AVX2-NEXT:    vmovdqa 16(%rsi), %xmm5
; AVX2-NEXT:    vmovdqa 32(%rsi), %xmm6
; AVX2-NEXT:    vpshufb %xmm3, %xmm6, %xmm3
; AVX2-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
; AVX2-NEXT:    vpor %xmm3, %xmm2, %xmm2
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = [5,6,7,8,9,10,128,128,128,128,128,0,1,2,3,4]
; AVX2-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = [128,128,128,128,128,128,2,5,8,11,14,128,128,128,128,128]
; AVX2-NEXT:    vpshufb %xmm4, %xmm5, %xmm5
; AVX2-NEXT:    vpor %xmm5, %xmm2, %xmm2
; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
; AVX2-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX2-NEXT:    vpshufb %xmm4, %xmm1, %xmm1
; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX2-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: PR50049:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512-NEXT:    vmovdqa 32(%rdi), %xmm2
; AVX512-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,4,7,10,13,128,128,128,128,128,128,u,u,u,u,u]
; AVX512-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512-NEXT:    vmovdqa {{.*#+}} xmm4 = [128,128,128,128,128,0,3,6,9,12,15,u,u,u,u,u]
; AVX512-NEXT:    vpshufb %xmm4, %xmm0, %xmm0
; AVX512-NEXT:    vpor %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vmovdqa (%rsi), %xmm2
; AVX512-NEXT:    vmovdqa 16(%rsi), %xmm5
; AVX512-NEXT:    vmovdqa 32(%rsi), %xmm6
; AVX512-NEXT:    vpshufb %xmm3, %xmm6, %xmm3
; AVX512-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
; AVX512-NEXT:    vpor %xmm3, %xmm2, %xmm2
; AVX512-NEXT:    vmovdqa {{.*#+}} xmm3 = [5,6,7,8,9,10,128,128,128,128,128,0,1,2,3,4]
; AVX512-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
; AVX512-NEXT:    vmovdqa {{.*#+}} xmm4 = [128,128,128,128,128,128,2,5,8,11,14,128,128,128,128,128]
; AVX512-NEXT:    vpshufb %xmm4, %xmm5, %xmm5
; AVX512-NEXT:    vpor %xmm5, %xmm2, %xmm2
; AVX512-NEXT:    vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
; AVX512-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
; AVX512-NEXT:    vpshufb %xmm4, %xmm1, %xmm1
; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
; AVX512-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %x1 = load <48 x i8>, ptr %p1, align 16
  %x2 = load <48 x i8>, ptr %p2, align 16
  %s1 = shufflevector <48 x i8> %x1, <48 x i8> poison, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45>
  %s2 = shufflevector <48 x i8> %x2, <48 x i8> poison, <16 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21, i32 24, i32 27, i32 30, i32 33, i32 36, i32 39, i32 42, i32 45>
  %r = mul <16 x i8> %s1, %s2
  ret <16 x i8> %r
}