; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,X86
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,X64

; Verify that we correctly fold target-specific packed vector shifts by
; immediate count into a simple build_vector when the elements of the input
; vector to the packed shift are all constants or undef.
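; For example, test1 below shifts the constant vector <1,2,4,8,1,2,4,8> left
; by 3 bits per element, so the expected lowering is a single load of the
; constant <8,16,32,64,8,16,32,64> with no shift instruction emitted.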

define <8 x i16> @test1() {
; CHECK-LABEL: test1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps {{.*#+}} xmm0 = [8,16,32,64,8,16,32,64]
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = tail call <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> <i16 1, i16 2, i16 4, i16 8, i16 1, i16 2, i16 4, i16 8>, i32 3)
  ret <8 x i16> %1
}

define <8 x i16> @test2() {
; CHECK-LABEL: test2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps {{.*#+}} xmm0 = [0,1,2,4,0,1,2,4]
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = tail call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> <i16 4, i16 8, i16 16, i16 32, i16 4, i16 8, i16 16, i16 32>, i32 3)
  ret <8 x i16> %1
}

define <8 x i16> @test3() {
; CHECK-LABEL: test3:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps {{.*#+}} xmm0 = [0,1,2,4,0,1,2,4]
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> <i16 4, i16 8, i16 16, i16 32, i16 4, i16 8, i16 16, i16 32>, i32 3)
  ret <8 x i16> %1
}

define <4 x i32> @test4() {
; CHECK-LABEL: test4:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps {{.*#+}} xmm0 = [8,16,32,64]
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = tail call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> <i32 1, i32 2, i32 4, i32 8>, i32 3)
  ret <4 x i32> %1
}

define <4 x i32> @test5() {
; CHECK-LABEL: test5:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps {{.*#+}} xmm0 = [0,1,2,4]
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = tail call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> <i32 4, i32 8, i32 16, i32 32>, i32 3)
  ret <4 x i32> %1
}

define <4 x i32> @test6() {
; CHECK-LABEL: test6:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps {{.*#+}} xmm0 = [0,1,2,4]
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> <i32 4, i32 8, i32 16, i32 32>, i32 3)
  ret <4 x i32> %1
}

define <2 x i64> @test7() {
; X86-LABEL: test7:
; X86:       # %bb.0:
; X86-NEXT:    movaps {{.*#+}} xmm0 = [8,0,16,0]
; X86-NEXT:    retl
;
; X64-LABEL: test7:
; X64:       # %bb.0:
; X64-NEXT:    movaps {{.*#+}} xmm0 = [8,16]
; X64-NEXT:    retq
  %1 = tail call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> <i64 1, i64 2>, i32 3)
  ret <2 x i64> %1
}

define <2 x i64> @test8() {
; X86-LABEL: test8:
; X86:       # %bb.0:
; X86-NEXT:    movaps {{.*#+}} xmm0 = [1,0,2,0]
; X86-NEXT:    retl
;
; X64-LABEL: test8:
; X64:       # %bb.0:
; X64-NEXT:    movaps {{.*#+}} xmm0 = [1,2]
; X64-NEXT:    retq
  %1 = tail call <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64> <i64 8, i64 16>, i32 3)
  ret <2 x i64> %1
}

define <8 x i16> @test9() {
; CHECK-LABEL: test9:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps {{.*#+}} xmm0 = [1,1,0,0,3,0,8,16]
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> <i16 15, i16 8, i16 undef, i16 undef, i16 31, i16 undef, i16 64, i16 128>, i32 3)
  ret <8 x i16> %1
}

define <4 x i32> @test10() {
; CHECK-LABEL: test10:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps {{.*#+}} xmm0 = [0,1,0,4]
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> <i32 undef, i32 8, i32 undef, i32 32>, i32 3)
  ret <4 x i32> %1
}

define <2 x i64> @test11() {
; X86-LABEL: test11:
; X86:       # %bb.0:
; X86-NEXT:    movaps {{.*#+}} xmm0 = [0,0,3,0]
; X86-NEXT:    retl
;
; X64-LABEL: test11:
; X64:       # %bb.0:
; X64-NEXT:    movaps {{.*#+}} xmm0 = [0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0]
; X64-NEXT:    retq
  %1 = tail call <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64> <i64 undef, i64 31>, i32 3)
  ret <2 x i64> %1
}

define <8 x i16> @test12() {
; CHECK-LABEL: test12:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps {{.*#+}} xmm0 = [1,1,0,0,3,0,8,16]
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> <i16 15, i16 8, i16 undef, i16 undef, i16 31, i16 undef, i16 64, i16 128>, i32 3)
  ret <8 x i16> %1
}

define <4 x i32> @test13() {
; CHECK-LABEL: test13:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps {{.*#+}} xmm0 = [0,1,0,4]
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = tail call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> <i32 undef, i32 8, i32 undef, i32 32>, i32 3)
  ret <4 x i32> %1
}

define <8 x i16> @test14() {
; CHECK-LABEL: test14:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps {{.*#+}} xmm0 = [1,1,0,0,3,0,8,16]
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = tail call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> <i16 15, i16 8, i16 undef, i16 undef, i16 31, i16 undef, i16 64, i16 128>, i32 3)
  ret <8 x i16> %1
}

define <4 x i32> @test15() {
; CHECK-LABEL: test15:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movaps {{.*#+}} xmm0 = [0,64,0,256]
; CHECK-NEXT:    ret{{[l|q]}}
  %1 = tail call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> <i32 undef, i32 8, i32 undef, i32 32>, i32 3)
  ret <4 x i32> %1
}

define <2 x i64> @test16() {
; X86-LABEL: test16:
; X86:       # %bb.0:
; X86-NEXT:    movaps {{.*#+}} xmm0 = [0,0,248,0]
; X86-NEXT:    retl
;
; X64-LABEL: test16:
; X64:       # %bb.0:
; X64-NEXT:    movaps {{.*#+}} xmm0 = [0,0,0,0,0,0,0,0,248,0,0,0,0,0,0,0]
; X64-NEXT:    retq
  %1 = tail call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> <i64 undef, i64 31>, i32 3)
  ret <2 x i64> %1
}

; Make sure we fold fully undef input vectors. We previously folded only when
; undef had a single use so use 2 undefs.
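; With a fully undef source the shifts fold to zero, so the checks for test17
; and test18 below only expect a zeroed register (xorps) that is both stored
; and returned.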
define <4 x i32> @test17(<4 x i32> %a0, ptr %dummy) {
; X86-LABEL: test17:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    xorps %xmm0, %xmm0
; X86-NEXT:    movaps %xmm0, (%eax)
; X86-NEXT:    retl
;
; X64-LABEL: test17:
; X64:       # %bb.0:
; X64-NEXT:    xorps %xmm0, %xmm0
; X64-NEXT:    movaps %xmm0, (%rdi)
; X64-NEXT:    retq
  %a = call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> undef, i32 6)
  store <4 x i32> %a, ptr %dummy
  %res = call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> undef, i32 7)
  ret <4 x i32> %res
}

define <4 x i32> @test18(<4 x i32> %a0, ptr %dummy) {
; X86-LABEL: test18:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    xorps %xmm0, %xmm0
; X86-NEXT:    movaps %xmm0, (%eax)
; X86-NEXT:    retl
;
; X64-LABEL: test18:
; X64:       # %bb.0:
; X64-NEXT:    xorps %xmm0, %xmm0
; X64-NEXT:    movaps %xmm0, (%rdi)
; X64-NEXT:    retq
  %a = call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> undef, i32 3)
  store <4 x i32> %a, ptr %dummy
  %res = call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> undef, i32 1)
  ret <4 x i32> %res
}

; PR39482
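; The shift amount in the following tests is a scalar computed from an
; extractelement plus scalar arithmetic, so it is not a compile-time constant;
; the checks expect the count to end up in the low lane of an XMM register and
; the register form of the shift (pslld/psrld/psrad) to be used rather than an
; immediate.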

define <4 x i32> @extelt0_sub_pslli_v4i32(<4 x i32> %x, <4 x i32> %y){
; CHECK-LABEL: extelt0_sub_pslli_v4i32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movdqa {{.*#+}} xmm2 = [32,32,32,32]
; CHECK-NEXT:    psubd %xmm1, %xmm2
; CHECK-NEXT:    pxor %xmm1, %xmm1
; CHECK-NEXT:    movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
; CHECK-NEXT:    pslld %xmm1, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %ext = extractelement <4 x i32> %y, i64 0
  %bo = sub i32 32, %ext
  %r = tail call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %x, i32 %bo)
  ret <4 x i32> %r
}

define <4 x i32> @extelt1_add_psrli_v4i32(<4 x i32> %x, <4 x i32> %y){
; X86-LABEL: extelt1_add_psrli_v4i32:
; X86:       # %bb.0:
; X86-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
; X86-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
; X86-NEXT:    xorps %xmm2, %xmm2
; X86-NEXT:    movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
; X86-NEXT:    psrld %xmm2, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: extelt1_add_psrli_v4i32:
; X64:       # %bb.0:
; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
; X64-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; X64-NEXT:    xorps %xmm2, %xmm2
; X64-NEXT:    movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
; X64-NEXT:    psrld %xmm2, %xmm0
; X64-NEXT:    retq
  %ext = extractelement <4 x i32> %y, i64 1
  %bo = add i32 %ext, 3
  %r = tail call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> %x, i32 %bo)
  ret <4 x i32> %r
}

define i32 @extelt1_add_psrai_v4i32_uses(<4 x i32> %x, <4 x i32> %y){
; CHECK-LABEL: extelt1_add_psrai_v4i32_uses:
; CHECK:       # %bb.0:
; CHECK-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
; CHECK-NEXT:    movd %xmm1, %ecx
; CHECK-NEXT:    addl $3, %ecx
; CHECK-NEXT:    movd %ecx, %xmm1
; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; CHECK-NEXT:    psrad %xmm1, %xmm0
; CHECK-NEXT:    movd %xmm0, %eax
; CHECK-NEXT:    imull %ecx, %eax
; CHECK-NEXT:    ret{{[l|q]}}
  %ext = extractelement <4 x i32> %y, i64 1
  %bo = add i32 %ext, 3
  %r = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %x, i32 %bo)
  %ext3 = extractelement <4 x i32> %r, i64 3
  %r2 = mul i32 %bo, %ext3
  ret i32 %r2
}

define <4 x i32> @extelt0_twice_sub_pslli_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z){
; CHECK-LABEL: extelt0_twice_sub_pslli_v4i32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    movd %xmm1, %eax
; CHECK-NEXT:    movd %xmm2, %ecx
; CHECK-NEXT:    subl %ecx, %eax
; CHECK-NEXT:    movd %eax, %xmm1
; CHECK-NEXT:    pslld %xmm1, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %ext1 = extractelement <4 x i32> %y, i64 0
  %ext2 = extractelement <4 x i32> %z, i64 0
  %bo = sub i32 %ext1, %ext2
  %r = tail call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %x, i32 %bo)
  ret <4 x i32> %r
}

; This would crash because the scalar shift amount has a different type than the shift result.

define <2 x i8> @PR58661(<2 x i8> %a0) {
; X86-LABEL: PR58661:
; X86:       # %bb.0:
; X86-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: PR58661:
; X64:       # %bb.0:
; X64-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; X64-NEXT:    retq
  %shuffle = shufflevector <2 x i8> %a0, <2 x i8> <i8 poison, i8 0>, <2 x i32> <i32 1, i32 3>
  %x = bitcast <2 x i8> %shuffle to i16
  %shl = shl nuw i16 %x, 8
  %y = bitcast i16 %shl to <2 x i8>
  ret <2 x i8> %y
}

declare <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16>, i32)
declare <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16>, i32)
declare <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16>, i32)
declare <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32>, i32)
declare <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32>, i32)
declare <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32>, i32)
declare <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64>, i32)
declare <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64>, i32)