; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE2,X64,X64-SSE2
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX2,X64,X64-AVX2
; RUN: llc < %s -mtriple=i686-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE2,X86,X86-SSE2
; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX2,X86,X86-AVX2

;------------------------------ 32-bit shuffles -------------------------------;

define <4 x i32> @shuffle_i32_of_shl_i16(<8 x i16> %x, <8 x i16> %y) nounwind {
; SSE2-LABEL: shuffle_i32_of_shl_i16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    psllw $15, %xmm0
; SSE2-NEXT:    psllw $15, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,2],xmm0[1,0]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    ret{{[l|q]}}
;
; AVX2-LABEL: shuffle_i32_of_shl_i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsllw $15, %xmm0, %xmm0
; AVX2-NEXT:    vpsllw $15, %xmm1, %xmm1
; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[3,2],xmm0[1,0]
; AVX2-NEXT:    ret{{[l|q]}}
  %i1 = tail call <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> %x, i32 15)
  %i2 = tail call <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> %y, i32 15)
  %i3 = bitcast <8 x i16> %i1 to <4 x i32>
  %i4 = bitcast <8 x i16> %i2 to <4 x i32>
  %i5 = shufflevector <4 x i32> %i3, <4 x i32> %i4, <4 x i32> <i32 7, i32 6, i32 1, i32 0>
  ret <4 x i32> %i5
}
define <4 x i32> @shuffle_i32_of_lshr_i16(<8 x i16> %x, <8 x i16> %y) nounwind {
; SSE2-LABEL: shuffle_i32_of_lshr_i16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    psrlw $15, %xmm0
; SSE2-NEXT:    psrlw $15, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,2],xmm0[1,0]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    ret{{[l|q]}}
;
; AVX2-LABEL: shuffle_i32_of_lshr_i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrlw $15, %xmm0, %xmm0
; AVX2-NEXT:    vpsrlw $15, %xmm1, %xmm1
; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[3,2],xmm0[1,0]
; AVX2-NEXT:    ret{{[l|q]}}
  %i1 = tail call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> %x, i32 15)
  %i2 = tail call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> %y, i32 15)
  %i3 = bitcast <8 x i16> %i1 to <4 x i32>
  %i4 = bitcast <8 x i16> %i2 to <4 x i32>
  %i5 = shufflevector <4 x i32> %i3, <4 x i32> %i4, <4 x i32> <i32 7, i32 6, i32 1, i32 0>
  ret <4 x i32> %i5
}
define <4 x i32> @shuffle_i32_of_ashr_i16(<8 x i16> %x, <8 x i16> %y) nounwind {
; SSE2-LABEL: shuffle_i32_of_ashr_i16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    psraw $15, %xmm0
; SSE2-NEXT:    psraw $15, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,2],xmm0[1,0]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    ret{{[l|q]}}
;
; AVX2-LABEL: shuffle_i32_of_ashr_i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsraw $15, %xmm0, %xmm0
; AVX2-NEXT:    vpsraw $15, %xmm1, %xmm1
; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[3,2],xmm0[1,0]
; AVX2-NEXT:    ret{{[l|q]}}
  %i1 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %x, i32 15)
  %i2 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %y, i32 15)
  %i3 = bitcast <8 x i16> %i1 to <4 x i32>
  %i4 = bitcast <8 x i16> %i2 to <4 x i32>
  %i5 = shufflevector <4 x i32> %i3, <4 x i32> %i4, <4 x i32> <i32 7, i32 6, i32 1, i32 0>
  ret <4 x i32> %i5
}

define <4 x i32> @shuffle_i32_of_shl_i32(<4 x i32> %x, <4 x i32> %y) nounwind {
; SSE2-LABEL: shuffle_i32_of_shl_i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pslld $31, %xmm0
; SSE2-NEXT:    pslld $31, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,2],xmm0[1,0]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    ret{{[l|q]}}
;
; AVX2-LABEL: shuffle_i32_of_shl_i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpslld $31, %xmm0, %xmm0
; AVX2-NEXT:    vpslld $31, %xmm1, %xmm1
; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[3,2],xmm0[1,0]
; AVX2-NEXT:    ret{{[l|q]}}
  %i1 = tail call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %x, i32 31)
  %i2 = tail call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %y, i32 31)
  %i3 = shufflevector <4 x i32> %i1, <4 x i32> %i2, <4 x i32> <i32 7, i32 6, i32 1, i32 0>
  ret <4 x i32> %i3
}
define <4 x i32> @shuffle_i32_of_lshr_i32(<4 x i32> %x, <4 x i32> %y) nounwind {
; SSE2-LABEL: shuffle_i32_of_lshr_i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    psrld $31, %xmm0
; SSE2-NEXT:    psrld $31, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,2],xmm0[1,0]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    ret{{[l|q]}}
;
; AVX2-LABEL: shuffle_i32_of_lshr_i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrld $31, %xmm0, %xmm0
; AVX2-NEXT:    vpsrld $31, %xmm1, %xmm1
; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[3,2],xmm0[1,0]
; AVX2-NEXT:    ret{{[l|q]}}
  %i1 = tail call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> %x, i32 31)
  %i2 = tail call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> %y, i32 31)
  %i3 = shufflevector <4 x i32> %i1, <4 x i32> %i2, <4 x i32> <i32 7, i32 6, i32 1, i32 0>
  ret <4 x i32> %i3
}
define <4 x i32> @shuffle_i32_of_ashr_i32(<4 x i32> %x, <4 x i32> %y) nounwind {
; SSE2-LABEL: shuffle_i32_of_ashr_i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    psrad $31, %xmm0
; SSE2-NEXT:    psrad $31, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,2],xmm0[1,0]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    ret{{[l|q]}}
;
; AVX2-LABEL: shuffle_i32_of_ashr_i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrad $31, %xmm0, %xmm0
; AVX2-NEXT:    vpsrad $31, %xmm1, %xmm1
; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[3,2],xmm0[1,0]
; AVX2-NEXT:    ret{{[l|q]}}
  %i1 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %x, i32 31)
  %i2 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %y, i32 31)
  %i3 = shufflevector <4 x i32> %i1, <4 x i32> %i2, <4 x i32> <i32 7, i32 6, i32 1, i32 0>
  ret <4 x i32> %i3
}

define <4 x i32> @shuffle_i32_of_shl_i64(<2 x i64> %x, <2 x i64> %y) nounwind {
; SSE2-LABEL: shuffle_i32_of_shl_i64:
; SSE2:       # %bb.0:
; SSE2-NEXT:    psllq $63, %xmm0
; SSE2-NEXT:    psllq $63, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,2],xmm0[1,0]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    ret{{[l|q]}}
;
; AVX2-LABEL: shuffle_i32_of_shl_i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsllq $63, %xmm0, %xmm0
; AVX2-NEXT:    vpsllq $63, %xmm1, %xmm1
; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[3,2],xmm0[1,0]
; AVX2-NEXT:    ret{{[l|q]}}
  %i1 = tail call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> %x, i32 63)
  %i2 = tail call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> %y, i32 63)
  %i3 = bitcast <2 x i64> %i1 to <4 x i32>
  %i4 = bitcast <2 x i64> %i2 to <4 x i32>
  %i5 = shufflevector <4 x i32> %i3, <4 x i32> %i4, <4 x i32> <i32 7, i32 6, i32 1, i32 0>
  ret <4 x i32> %i5
}
define <4 x i32> @shuffle_i32_of_lshr_i64(<2 x i64> %x, <2 x i64> %y) nounwind {
; SSE2-LABEL: shuffle_i32_of_lshr_i64:
; SSE2:       # %bb.0:
; SSE2-NEXT:    psrlq $63, %xmm0
; SSE2-NEXT:    psrlq $63, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,2],xmm0[1,0]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    ret{{[l|q]}}
;
; AVX2-LABEL: shuffle_i32_of_lshr_i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrlq $63, %xmm0, %xmm0
; AVX2-NEXT:    vpsrlq $63, %xmm1, %xmm1
; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[3,2],xmm0[1,0]
; AVX2-NEXT:    ret{{[l|q]}}
  %i1 = tail call <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64> %x, i32 63)
  %i2 = tail call <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64> %y, i32 63)
  %i3 = bitcast <2 x i64> %i1 to <4 x i32>
  %i4 = bitcast <2 x i64> %i2 to <4 x i32>
  %i5 = shufflevector <4 x i32> %i3, <4 x i32> %i4, <4 x i32> <i32 7, i32 6, i32 1, i32 0>
  ret <4 x i32> %i5
}
define <4 x i32> @shuffle_i32_of_ashr_i64(<2 x i64> %x, <2 x i64> %y) nounwind {
; X64-SSE2-LABEL: shuffle_i32_of_ashr_i64:
; X64-SSE2:       # %bb.0:
; X64-SSE2-NEXT:    subq $40, %rsp
; X64-SSE2-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; X64-SSE2-NEXT:    movl $63, %edi
; X64-SSE2-NEXT:    callq llvm.x86.sse2.psrai.q@PLT
; X64-SSE2-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
; X64-SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; X64-SSE2-NEXT:    movl $63, %edi
; X64-SSE2-NEXT:    callq llvm.x86.sse2.psrai.q@PLT
; X64-SSE2-NEXT:    shufps $27, (%rsp), %xmm0 # 16-byte Folded Reload
; X64-SSE2-NEXT:    # xmm0 = xmm0[3,2],mem[1,0]
; X64-SSE2-NEXT:    addq $40, %rsp
; X64-SSE2-NEXT:    retq
;
; X64-AVX2-LABEL: shuffle_i32_of_ashr_i64:
; X64-AVX2:       # %bb.0:
; X64-AVX2-NEXT:    subq $40, %rsp
; X64-AVX2-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; X64-AVX2-NEXT:    movl $63, %edi
; X64-AVX2-NEXT:    callq llvm.x86.sse2.psrai.q@PLT
; X64-AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; X64-AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; X64-AVX2-NEXT:    movl $63, %edi
; X64-AVX2-NEXT:    callq llvm.x86.sse2.psrai.q@PLT
; X64-AVX2-NEXT:    vshufps $27, (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; X64-AVX2-NEXT:    # xmm0 = xmm0[3,2],mem[1,0]
; X64-AVX2-NEXT:    addq $40, %rsp
; X64-AVX2-NEXT:    retq
;
; X86-SSE2-LABEL: shuffle_i32_of_ashr_i64:
; X86-SSE2:       # %bb.0:
; X86-SSE2-NEXT:    subl $32, %esp
; X86-SSE2-NEXT:    movups %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-SSE2-NEXT:    pushl $63
; X86-SSE2-NEXT:    calll llvm.x86.sse2.psrai.q@PLT
; X86-SSE2-NEXT:    addl $4, %esp
; X86-SSE2-NEXT:    movups %xmm0, (%esp) # 16-byte Spill
; X86-SSE2-NEXT:    movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-SSE2-NEXT:    pushl $63
; X86-SSE2-NEXT:    calll llvm.x86.sse2.psrai.q@PLT
; X86-SSE2-NEXT:    addl $4, %esp
; X86-SSE2-NEXT:    movups (%esp), %xmm1 # 16-byte Reload
; X86-SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,2],xmm1[1,0]
; X86-SSE2-NEXT:    addl $32, %esp
; X86-SSE2-NEXT:    retl
;
; X86-AVX2-LABEL: shuffle_i32_of_ashr_i64:
; X86-AVX2:       # %bb.0:
; X86-AVX2-NEXT:    subl $32, %esp
; X86-AVX2-NEXT:    vmovups %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-AVX2-NEXT:    pushl $63
; X86-AVX2-NEXT:    calll llvm.x86.sse2.psrai.q@PLT
; X86-AVX2-NEXT:    addl $4, %esp
; X86-AVX2-NEXT:    vmovups %xmm0, (%esp) # 16-byte Spill
; X86-AVX2-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-AVX2-NEXT:    pushl $63
; X86-AVX2-NEXT:    calll llvm.x86.sse2.psrai.q@PLT
; X86-AVX2-NEXT:    addl $4, %esp
; X86-AVX2-NEXT:    vshufps $27, (%esp), %xmm0, %xmm0 # 16-byte Folded Reload
; X86-AVX2-NEXT:    # xmm0 = xmm0[3,2],mem[1,0]
; X86-AVX2-NEXT:    addl $32, %esp
; X86-AVX2-NEXT:    retl
  %i1 = tail call <2 x i64> @llvm.x86.sse2.psrai.q(<2 x i64> %x, i32 63)
  %i2 = tail call <2 x i64> @llvm.x86.sse2.psrai.q(<2 x i64> %y, i32 63)
  %i3 = bitcast <2 x i64> %i1 to <4 x i32>
  %i4 = bitcast <2 x i64> %i2 to <4 x i32>
  %i5 = shufflevector <4 x i32> %i3, <4 x i32> %i4, <4 x i32> <i32 7, i32 6, i32 1, i32 0>
  ret <4 x i32> %i5
}

;------------------------------ 64-bit shuffles -------------------------------;

define <2 x i64> @shuffle_i64_of_shl_i16(<8 x i16> %x, <8 x i16> %y) nounwind {
; SSE2-LABEL: shuffle_i64_of_shl_i16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    psllw $15, %xmm0
; SSE2-NEXT:    psllw $15, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,3],xmm0[0,1]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    ret{{[l|q]}}
;
; AVX2-LABEL: shuffle_i64_of_shl_i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsllw $15, %xmm0, %xmm0
; AVX2-NEXT:    vpsllw $15, %xmm1, %xmm1
; AVX2-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
; AVX2-NEXT:    ret{{[l|q]}}
  %i1 = tail call <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> %x, i32 15)
  %i2 = tail call <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> %y, i32 15)
  %i3 = bitcast <8 x i16> %i1 to <2 x i64>
  %i4 = bitcast <8 x i16> %i2 to <2 x i64>
  %i5 = shufflevector <2 x i64> %i3, <2 x i64> %i4, <2 x i32> <i32 3, i32 0>
  ret <2 x i64> %i5
}
define <2 x i64> @shuffle_i64_of_lshr_i16(<8 x i16> %x, <8 x i16> %y) nounwind {
; SSE2-LABEL: shuffle_i64_of_lshr_i16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    psrlw $15, %xmm0
; SSE2-NEXT:    psrlw $15, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,3],xmm0[0,1]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    ret{{[l|q]}}
;
; AVX2-LABEL: shuffle_i64_of_lshr_i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrlw $15, %xmm0, %xmm0
; AVX2-NEXT:    vpsrlw $15, %xmm1, %xmm1
; AVX2-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
; AVX2-NEXT:    ret{{[l|q]}}
  %i1 = tail call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> %x, i32 15)
  %i2 = tail call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> %y, i32 15)
  %i3 = bitcast <8 x i16> %i1 to <2 x i64>
  %i4 = bitcast <8 x i16> %i2 to <2 x i64>
  %i5 = shufflevector <2 x i64> %i3, <2 x i64> %i4, <2 x i32> <i32 3, i32 0>
  ret <2 x i64> %i5
}
define <2 x i64> @shuffle_i64_of_ashr_i16(<8 x i16> %x, <8 x i16> %y) nounwind {
; SSE2-LABEL: shuffle_i64_of_ashr_i16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    psraw $15, %xmm0
; SSE2-NEXT:    psraw $15, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,3],xmm0[0,1]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    ret{{[l|q]}}
;
; AVX2-LABEL: shuffle_i64_of_ashr_i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsraw $15, %xmm0, %xmm0
; AVX2-NEXT:    vpsraw $15, %xmm1, %xmm1
; AVX2-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
; AVX2-NEXT:    ret{{[l|q]}}
  %i1 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %x, i32 15)
  %i2 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %y, i32 15)
  %i3 = bitcast <8 x i16> %i1 to <2 x i64>
  %i4 = bitcast <8 x i16> %i2 to <2 x i64>
  %i5 = shufflevector <2 x i64> %i3, <2 x i64> %i4, <2 x i32> <i32 3, i32 0>
  ret <2 x i64> %i5
}

define <2 x i64> @shuffle_i64_of_shl_i32(<4 x i32> %x, <4 x i32> %y) nounwind {
; SSE2-LABEL: shuffle_i64_of_shl_i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pslld $31, %xmm0
; SSE2-NEXT:    pslld $31, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,3],xmm0[0,1]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    ret{{[l|q]}}
;
; AVX2-LABEL: shuffle_i64_of_shl_i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpslld $31, %xmm0, %xmm0
; AVX2-NEXT:    vpslld $31, %xmm1, %xmm1
; AVX2-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
; AVX2-NEXT:    ret{{[l|q]}}
  %i1 = tail call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %x, i32 31)
  %i2 = tail call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %y, i32 31)
  %i3 = bitcast <4 x i32> %i1 to <2 x i64>
  %i4 = bitcast <4 x i32> %i2 to <2 x i64>
  %i5 = shufflevector <2 x i64> %i3, <2 x i64> %i4, <2 x i32> <i32 3, i32 0>
  ret <2 x i64> %i5
}
define <2 x i64> @shuffle_i64_of_lshr_i32(<4 x i32> %x, <4 x i32> %y) nounwind {
; SSE2-LABEL: shuffle_i64_of_lshr_i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    psrld $31, %xmm0
; SSE2-NEXT:    psrld $31, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,3],xmm0[0,1]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    ret{{[l|q]}}
;
; AVX2-LABEL: shuffle_i64_of_lshr_i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrld $31, %xmm0, %xmm0
; AVX2-NEXT:    vpsrld $31, %xmm1, %xmm1
; AVX2-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
; AVX2-NEXT:    ret{{[l|q]}}
  %i1 = tail call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> %x, i32 31)
  %i2 = tail call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> %y, i32 31)
  %i3 = bitcast <4 x i32> %i1 to <2 x i64>
  %i4 = bitcast <4 x i32> %i2 to <2 x i64>
  %i5 = shufflevector <2 x i64> %i3, <2 x i64> %i4, <2 x i32> <i32 3, i32 0>
  ret <2 x i64> %i5
}
define <2 x i64> @shuffle_i64_of_ashr_i32(<4 x i32> %x, <4 x i32> %y) nounwind {
; SSE2-LABEL: shuffle_i64_of_ashr_i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    psrad $31, %xmm0
; SSE2-NEXT:    psrad $31, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,3],xmm0[0,1]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    ret{{[l|q]}}
;
; AVX2-LABEL: shuffle_i64_of_ashr_i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrad $31, %xmm0, %xmm0
; AVX2-NEXT:    vpsrad $31, %xmm1, %xmm1
; AVX2-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
; AVX2-NEXT:    ret{{[l|q]}}
  %i1 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %x, i32 31)
  %i2 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %y, i32 31)
  %i3 = bitcast <4 x i32> %i1 to <2 x i64>
  %i4 = bitcast <4 x i32> %i2 to <2 x i64>
  %i5 = shufflevector <2 x i64> %i3, <2 x i64> %i4, <2 x i32> <i32 3, i32 0>
  ret <2 x i64> %i5
}

define <2 x i64> @shuffle_i64_of_shl_i64(<2 x i64> %x, <2 x i64> %y) nounwind {
; SSE2-LABEL: shuffle_i64_of_shl_i64:
; SSE2:       # %bb.0:
; SSE2-NEXT:    psllq $63, %xmm0
; SSE2-NEXT:    psllq $63, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,3],xmm0[0,1]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    ret{{[l|q]}}
;
; AVX2-LABEL: shuffle_i64_of_shl_i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsllq $63, %xmm0, %xmm0
; AVX2-NEXT:    vpsllq $63, %xmm1, %xmm1
; AVX2-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
; AVX2-NEXT:    ret{{[l|q]}}
  %i1 = tail call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> %x, i32 63)
  %i2 = tail call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> %y, i32 63)
  %i3 = bitcast <2 x i64> %i1 to <2 x i64>
  %i4 = bitcast <2 x i64> %i2 to <2 x i64>
  %i5 = shufflevector <2 x i64> %i3, <2 x i64> %i4, <2 x i32> <i32 3, i32 0>
  ret <2 x i64> %i5
}
define <2 x i64> @shuffle_i64_of_lshr_i64(<2 x i64> %x, <2 x i64> %y) nounwind {
; SSE2-LABEL: shuffle_i64_of_lshr_i64:
; SSE2:       # %bb.0:
; SSE2-NEXT:    psrlq $63, %xmm0
; SSE2-NEXT:    psrlq $63, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,3],xmm0[0,1]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    ret{{[l|q]}}
;
; AVX2-LABEL: shuffle_i64_of_lshr_i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrlq $63, %xmm0, %xmm0
; AVX2-NEXT:    vpsrlq $63, %xmm1, %xmm1
; AVX2-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
; AVX2-NEXT:    ret{{[l|q]}}
  %i1 = tail call <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64> %x, i32 63)
  %i2 = tail call <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64> %y, i32 63)
  %i3 = bitcast <2 x i64> %i1 to <2 x i64>
  %i4 = bitcast <2 x i64> %i2 to <2 x i64>
  %i5 = shufflevector <2 x i64> %i3, <2 x i64> %i4, <2 x i32> <i32 3, i32 0>
  ret <2 x i64> %i5
}
define <2 x i64> @shuffle_i64_of_ashr_i64(<2 x i64> %x, <2 x i64> %y) nounwind {
; X64-SSE2-LABEL: shuffle_i64_of_ashr_i64:
; X64-SSE2:       # %bb.0:
; X64-SSE2-NEXT:    subq $40, %rsp
; X64-SSE2-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; X64-SSE2-NEXT:    movl $63, %edi
; X64-SSE2-NEXT:    callq llvm.x86.sse2.psrai.q@PLT
; X64-SSE2-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
; X64-SSE2-NEXT:    movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; X64-SSE2-NEXT:    movl $63, %edi
; X64-SSE2-NEXT:    callq llvm.x86.sse2.psrai.q@PLT
; X64-SSE2-NEXT:    shufpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
; X64-SSE2-NEXT:    # xmm0 = xmm0[1],mem[0]
; X64-SSE2-NEXT:    addq $40, %rsp
; X64-SSE2-NEXT:    retq
;
; X64-AVX2-LABEL: shuffle_i64_of_ashr_i64:
; X64-AVX2:       # %bb.0:
; X64-AVX2-NEXT:    subq $40, %rsp
; X64-AVX2-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; X64-AVX2-NEXT:    movl $63, %edi
; X64-AVX2-NEXT:    callq llvm.x86.sse2.psrai.q@PLT
; X64-AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; X64-AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; X64-AVX2-NEXT:    movl $63, %edi
; X64-AVX2-NEXT:    callq llvm.x86.sse2.psrai.q@PLT
; X64-AVX2-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
; X64-AVX2-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
; X64-AVX2-NEXT:    addq $40, %rsp
; X64-AVX2-NEXT:    retq
;
; X86-SSE2-LABEL: shuffle_i64_of_ashr_i64:
; X86-SSE2:       # %bb.0:
; X86-SSE2-NEXT:    subl $32, %esp
; X86-SSE2-NEXT:    movups %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-SSE2-NEXT:    pushl $63
; X86-SSE2-NEXT:    calll llvm.x86.sse2.psrai.q@PLT
; X86-SSE2-NEXT:    addl $4, %esp
; X86-SSE2-NEXT:    movups %xmm0, (%esp) # 16-byte Spill
; X86-SSE2-NEXT:    movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-SSE2-NEXT:    pushl $63
; X86-SSE2-NEXT:    calll llvm.x86.sse2.psrai.q@PLT
; X86-SSE2-NEXT:    addl $4, %esp
; X86-SSE2-NEXT:    movups (%esp), %xmm1 # 16-byte Reload
; X86-SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,3],xmm1[0,1]
; X86-SSE2-NEXT:    addl $32, %esp
; X86-SSE2-NEXT:    retl
;
; X86-AVX2-LABEL: shuffle_i64_of_ashr_i64:
; X86-AVX2:       # %bb.0:
; X86-AVX2-NEXT:    subl $32, %esp
; X86-AVX2-NEXT:    vmovups %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-AVX2-NEXT:    pushl $63
; X86-AVX2-NEXT:    calll llvm.x86.sse2.psrai.q@PLT
; X86-AVX2-NEXT:    addl $4, %esp
; X86-AVX2-NEXT:    vmovups %xmm0, (%esp) # 16-byte Spill
; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-AVX2-NEXT:    pushl $63
; X86-AVX2-NEXT:    calll llvm.x86.sse2.psrai.q@PLT
; X86-AVX2-NEXT:    addl $4, %esp
; X86-AVX2-NEXT:    vmovdqu (%esp), %xmm1 # 16-byte Reload
; X86-AVX2-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
; X86-AVX2-NEXT:    addl $32, %esp
; X86-AVX2-NEXT:    retl
  %i1 = tail call <2 x i64> @llvm.x86.sse2.psrai.q(<2 x i64> %x, i32 63)
  %i2 = tail call <2 x i64> @llvm.x86.sse2.psrai.q(<2 x i64> %y, i32 63)
  %i3 = bitcast <2 x i64> %i1 to <2 x i64>
  %i4 = bitcast <2 x i64> %i2 to <2 x i64>
  %i5 = shufflevector <2 x i64> %i3, <2 x i64> %i4, <2 x i32> <i32 3, i32 0>
  ret <2 x i64> %i5
}

declare <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16>, i32)
declare <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16>, i32)
declare <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16>, i32)
declare <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32>, i32)
declare <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32>, i32)
declare <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32>, i32)
declare <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64>, i32)
declare <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64>, i32)
declare <2 x i64> @llvm.x86.sse2.psrai.q(<2 x i64>, i32) ; does not exist
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; CHECK: {{.*}}
; X64: {{.*}}
; X86: {{.*}}