; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=sse2 | FileCheck %s --check-prefixes=SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=avx2 | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=avx512f | FileCheck %s --check-prefixes=AVX,AVX512

; Verify that we don't scalarize a packed vector shift left of 16-bit
; signed integers if the amount is a constant build_vector.
; Check that we produce an SSE2 packed integer multiply (pmullw) instead.

define <8 x i16> @test1(<8 x i16> %a) {
; SSE-LABEL: test1:
; SSE:       # %bb.0:
; SSE-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2,2,4,8,128,1,512,2048]
; SSE-NEXT:    retq
;
; AVX-LABEL: test1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [2,2,4,8,128,1,512,2048]
; AVX-NEXT:    retq
  %shl = shl <8 x i16> %a, <i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11>
  ret <8 x i16> %shl
}

; Only two legal shift amounts, so we can lower to shuffle(psllw(),psllw())

define <8 x i16> @test2(<8 x i16> %a) {
; SSE2-LABEL: test2:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    paddw %xmm0, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test2:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    paddw %xmm0, %xmm1
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT:    retq
;
; AVX-LABEL: test2:
; AVX:       # %bb.0:
; AVX-NEXT:    vpaddw %xmm0, %xmm0, %xmm1
; AVX-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX-NEXT:    retq
  %shl = shl <8 x i16> %a, <i16 0, i16 undef, i16 0, i16 0, i16 1, i16 undef, i16 -1, i16 1>
  ret <8 x i16> %shl
}

; Verify that a vector shift left of 32-bit signed integers is not scalarized
; when the vector of shift counts is a constant build_vector: SSE2/SSE4.1 lower
; it to a pair of shifts that are blended together, and AVX uses a single vpsllvd.
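; Note: in 'test3' the amounts -1 and -3 are out of range for an i32 shift (a shift
; by the bit width or more is poison), so those lanes are undefined and only the
; in-range amounts 1 and 2 need to be honored by the lowering.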

define <4 x i32> @test3(<4 x i32> %a) {
; SSE2-LABEL: test3:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    paddd %xmm0, %xmm1
; SSE2-NEXT:    pslld $2, %xmm0
; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test3:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    pslld $2, %xmm1
; SSE41-NEXT:    paddd %xmm0, %xmm0
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT:    retq
;
; AVX-LABEL: test3:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %shl = shl <4 x i32> %a, <i32 1, i32 -1, i32 2, i32 -3>
  ret <4 x i32> %shl
}

define <4 x i32> @test4(<4 x i32> %a) {
; SSE2-LABEL: test4:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    paddd %xmm0, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test4:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    paddd %xmm0, %xmm1
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT:    retq
;
; AVX-LABEL: test4:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %shl = shl <4 x i32> %a, <i32 0, i32 0, i32 1, i32 1>
  ret <4 x i32> %shl
}

; Without AVX2, verify that the following shift is split into two pmullw
; instructions. With AVX2, the test case below produces a single vpmullw.

define <16 x i16> @test5(<16 x i16> %a) {
; SSE-LABEL: test5:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [2,2,4,8,128,1,512,2048]
; SSE-NEXT:    pmullw %xmm2, %xmm0
; SSE-NEXT:    pmullw %xmm2, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: test5:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [2,2,4,8,128,1,512,2048,2,2,4,8,128,1,512,2048]
; AVX-NEXT:    retq
  %shl = shl <16 x i16> %a, <i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11>
  ret <16 x i16> %shl
}

; With SSE4.1 but not AVX2, verify that the following shift is split into two
; pmulld instructions. With AVX2, the test case below produces a single vpsllvd
; instead.
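; Note: the pmulld lowering works because a left shift by a constant amount C is a
; multiply by 2^C; the shift amounts <1,1,2,3> therefore become the multiplier
; vector [2,2,4,8] seen in the checks below.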

define <8 x i32> @test6(<8 x i32> %a) {
; SSE2-LABEL: test6:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2,2,4,8]
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm2, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [2,2,8,8]
; SSE2-NEXT:    pmuludq %xmm4, %xmm3
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
; SSE2-NEXT:    pmuludq %xmm1, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm4, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE2-NEXT:    movdqa %xmm2, %xmm1
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test6:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmovsxbd {{.*#+}} xmm2 = [2,2,4,8]
; SSE41-NEXT:    pmulld %xmm2, %xmm0
; SSE41-NEXT:    pmulld %xmm2, %xmm1
; SSE41-NEXT:    retq
;
; AVX-LABEL: test6:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX-NEXT:    retq
  %shl = shl <8 x i32> %a, <i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3>
  ret <8 x i32> %shl
}

; With AVX2 and AVX512, the test case below should produce a sequence of
; two vpmullw instructions. On SSE2 instead, we split the shift into four
; parts and then convert each part into a pmullw.

define <32 x i16> @test7(<32 x i16> %a) {
; SSE-LABEL: test7:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa {{.*#+}} xmm4 = [2,2,4,8,128,1,512,2048]
; SSE-NEXT:    pmullw %xmm4, %xmm0
; SSE-NEXT:    pmullw %xmm4, %xmm1
; SSE-NEXT:    pmullw %xmm4, %xmm2
; SSE-NEXT:    pmullw %xmm4, %xmm3
; SSE-NEXT:    retq
;
; AVX2-LABEL: test7:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [2,2,4,8,128,1,512,2048,2,2,4,8,128,1,512,2048]
; AVX2-NEXT:    # ymm2 = mem[0,1,0,1]
; AVX2-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpmullw %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test7:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [2,2,4,8,128,1,512,2048,2,2,4,8,128,1,512,2048]
; AVX512-NEXT:    # ymm2 = mem[0,1,0,1]
; AVX512-NEXT:    vpmullw %ymm2, %ymm1, %ymm1
; AVX512-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
; AVX512-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512-NEXT:    retq
  %shl = shl <32 x i16> %a, <i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11>
  ret <32 x i16> %shl
}

; Similar to test7; the difference is that with AVX512 support we produce a
; single vpsllvd/vpsllvq instead of a pair of vpsllvd/vpsllvq.
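; Note: SSE2 has no pmulld, so the SSE2 lowering below multiplies the even and odd
; 32-bit elements separately with pmuludq and re-interleaves the results with
; pshufd/punpckldq.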

define <16 x i32> @test8(<16 x i32> %a) {
; SSE2-LABEL: test8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [2,2,4,8]
; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm4, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [2,2,8,8]
; SSE2-NEXT:    pmuludq %xmm6, %xmm5
; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm4, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    pmuludq %xmm6, %xmm5
; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1]
; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm2[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm4, %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE2-NEXT:    pmuludq %xmm6, %xmm5
; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
; SSE2-NEXT:    pmuludq %xmm3, %xmm4
; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm6, %xmm3
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; SSE2-NEXT:    movdqa %xmm4, %xmm3
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmovsxbd {{.*#+}} xmm4 = [2,2,4,8]
; SSE41-NEXT:    pmulld %xmm4, %xmm0
; SSE41-NEXT:    pmulld %xmm4, %xmm1
; SSE41-NEXT:    pmulld %xmm4, %xmm2
; SSE41-NEXT:    pmulld %xmm4, %xmm3
; SSE41-NEXT:    retq
;
; AVX2-LABEL: test8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [1,1,2,3,1,1,2,3]
; AVX2-NEXT:    # ymm2 = mem[0,1,0,1]
; AVX2-NEXT:    vpsllvd %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpsllvd %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512-NEXT:    retq
  %shl = shl <16 x i32> %a, <i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3>
  ret <16 x i32> %shl
}

; Without AVX2/AVX512F support, the shift in 'test9' is performed one shift amount
; at a time and the results are blended together.
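; Note: the lanes shifted by 1 are lowered to paddq, while the lanes shifted by 2
; and 3 use psllq with an immediate and are blended back together.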

define <8 x i64> @test9(<8 x i64> %a) {
; SSE2-LABEL: test9:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm1, %xmm4
; SSE2-NEXT:    psllq $2, %xmm4
; SSE2-NEXT:    psllq $3, %xmm1
; SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm4[0],xmm1[1]
; SSE2-NEXT:    movdqa %xmm3, %xmm4
; SSE2-NEXT:    psllq $2, %xmm4
; SSE2-NEXT:    psllq $3, %xmm3
; SSE2-NEXT:    movsd {{.*#+}} xmm3 = xmm4[0],xmm3[1]
; SSE2-NEXT:    paddq %xmm0, %xmm0
; SSE2-NEXT:    paddq %xmm2, %xmm2
; SSE2-NEXT:    retq
;
; SSE41-LABEL: test9:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm1, %xmm4
; SSE41-NEXT:    psllq $3, %xmm4
; SSE41-NEXT:    psllq $2, %xmm1
; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4,5,6,7]
; SSE41-NEXT:    movdqa %xmm3, %xmm4
; SSE41-NEXT:    psllq $3, %xmm4
; SSE41-NEXT:    psllq $2, %xmm3
; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm4[4,5,6,7]
; SSE41-NEXT:    paddq %xmm0, %xmm0
; SSE41-NEXT:    paddq %xmm2, %xmm2
; SSE41-NEXT:    retq
;
; AVX2-LABEL: test9:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmovsxbq {{.*#+}} ymm2 = [1,1,2,3]
; AVX2-NEXT:    vpsllvq %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpsllvq %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    retq
;
; AVX512-LABEL: test9:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512-NEXT:    retq
  %shl = shl <8 x i64> %a, <i64 1, i64 1, i64 2, i64 3, i64 1, i64 1, i64 2, i64 3>
  ret <8 x i64> %shl
}