; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE2,X64,X64-SSE2
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX2,X64,X64-AVX2
; RUN: llc < %s -mtriple=i686-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE2,X86,X86-SSE2
; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX2,X86,X86-AVX2

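; These tests shuffle pairs of identically-shifted vectors. In principle a
; combine could rewrite the pattern
;   %a = shift %x, C
;   %b = shift %y, C
;   %r = shufflevector %a, %b, M
; into shift(shufflevector(%x, %y, M), C) whenever the shuffle moves whole
; shift elements; the CHECK lines below pin the current (unfolded) codegen,
; so any such fold would show up as a diff when the checks are regenerated.
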
;------------------------------ 32-bit shuffles -------------------------------;

define <4 x i32> @shuffle_i32_of_shl_i16(<8 x i16> %x, <8 x i16> %y) nounwind {
; SSE2-LABEL: shuffle_i32_of_shl_i16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    psllw $15, %xmm0
; SSE2-NEXT:    psllw $15, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,2],xmm0[1,0]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    ret{{[l|q]}}
;
; AVX2-LABEL: shuffle_i32_of_shl_i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsllw $15, %xmm0, %xmm0
; AVX2-NEXT:    vpsllw $15, %xmm1, %xmm1
; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[3,2],xmm0[1,0]
; AVX2-NEXT:    ret{{[l|q]}}
  %i1 = tail call <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> %x, i32 15)
  %i2 = tail call <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> %y, i32 15)
  %i3 = bitcast <8 x i16> %i1 to <4 x i32>
  %i4 = bitcast <8 x i16> %i2 to <4 x i32>
  %i5 = shufflevector <4 x i32> %i3, <4 x i32> %i4, <4 x i32> <i32 7, i32 6, i32 1, i32 0>
  ret <4 x i32> %i5
}
define <4 x i32> @shuffle_i32_of_lshr_i16(<8 x i16> %x, <8 x i16> %y) nounwind {
; SSE2-LABEL: shuffle_i32_of_lshr_i16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    psrlw $15, %xmm0
; SSE2-NEXT:    psrlw $15, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,2],xmm0[1,0]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    ret{{[l|q]}}
;
; AVX2-LABEL: shuffle_i32_of_lshr_i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrlw $15, %xmm0, %xmm0
; AVX2-NEXT:    vpsrlw $15, %xmm1, %xmm1
; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[3,2],xmm0[1,0]
; AVX2-NEXT:    ret{{[l|q]}}
  %i1 = tail call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> %x, i32 15)
  %i2 = tail call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> %y, i32 15)
  %i3 = bitcast <8 x i16> %i1 to <4 x i32>
  %i4 = bitcast <8 x i16> %i2 to <4 x i32>
  %i5 = shufflevector <4 x i32> %i3, <4 x i32> %i4, <4 x i32> <i32 7, i32 6, i32 1, i32 0>
  ret <4 x i32> %i5
}
define <4 x i32> @shuffle_i32_of_ashr_i16(<8 x i16> %x, <8 x i16> %y) nounwind {
; SSE2-LABEL: shuffle_i32_of_ashr_i16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    psraw $15, %xmm0
; SSE2-NEXT:    psraw $15, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,2],xmm0[1,0]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    ret{{[l|q]}}
;
; AVX2-LABEL: shuffle_i32_of_ashr_i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsraw $15, %xmm0, %xmm0
; AVX2-NEXT:    vpsraw $15, %xmm1, %xmm1
; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[3,2],xmm0[1,0]
; AVX2-NEXT:    ret{{[l|q]}}
  %i1 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %x, i32 15)
  %i2 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %y, i32 15)
  %i3 = bitcast <8 x i16> %i1 to <4 x i32>
  %i4 = bitcast <8 x i16> %i2 to <4 x i32>
  %i5 = shufflevector <4 x i32> %i3, <4 x i32> %i4, <4 x i32> <i32 7, i32 6, i32 1, i32 0>
  ret <4 x i32> %i5
}

define <4 x i32> @shuffle_i32_of_shl_i32(<4 x i32> %x, <4 x i32> %y) nounwind {
; SSE2-LABEL: shuffle_i32_of_shl_i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pslld $31, %xmm0
; SSE2-NEXT:    pslld $31, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,2],xmm0[1,0]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    ret{{[l|q]}}
;
; AVX2-LABEL: shuffle_i32_of_shl_i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpslld $31, %xmm0, %xmm0
; AVX2-NEXT:    vpslld $31, %xmm1, %xmm1
; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[3,2],xmm0[1,0]
; AVX2-NEXT:    ret{{[l|q]}}
  %i1 = tail call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %x, i32 31)
  %i2 = tail call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %y, i32 31)
  %i3 = shufflevector <4 x i32> %i1, <4 x i32> %i2, <4 x i32> <i32 7, i32 6, i32 1, i32 0>
  ret <4 x i32> %i3
}
define <4 x i32> @shuffle_i32_of_lshr_i32(<4 x i32> %x, <4 x i32> %y) nounwind {
; SSE2-LABEL: shuffle_i32_of_lshr_i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    psrld $31, %xmm0
; SSE2-NEXT:    psrld $31, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,2],xmm0[1,0]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    ret{{[l|q]}}
;
; AVX2-LABEL: shuffle_i32_of_lshr_i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrld $31, %xmm0, %xmm0
; AVX2-NEXT:    vpsrld $31, %xmm1, %xmm1
; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[3,2],xmm0[1,0]
; AVX2-NEXT:    ret{{[l|q]}}
  %i1 = tail call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> %x, i32 31)
  %i2 = tail call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> %y, i32 31)
  %i3 = shufflevector <4 x i32> %i1, <4 x i32> %i2, <4 x i32> <i32 7, i32 6, i32 1, i32 0>
  ret <4 x i32> %i3
}
define <4 x i32> @shuffle_i32_of_ashr_i32(<4 x i32> %x, <4 x i32> %y) nounwind {
; SSE2-LABEL: shuffle_i32_of_ashr_i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    psrad $31, %xmm0
; SSE2-NEXT:    psrad $31, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,2],xmm0[1,0]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    ret{{[l|q]}}
;
; AVX2-LABEL: shuffle_i32_of_ashr_i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrad $31, %xmm0, %xmm0
; AVX2-NEXT:    vpsrad $31, %xmm1, %xmm1
; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[3,2],xmm0[1,0]
; AVX2-NEXT:    ret{{[l|q]}}
  %i1 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %x, i32 31)
  %i2 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %y, i32 31)
  %i3 = shufflevector <4 x i32> %i1, <4 x i32> %i2, <4 x i32> <i32 7, i32 6, i32 1, i32 0>
  ret <4 x i32> %i3
}
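
; In the three i32 tests above, the shuffle and the shifts already agree on
; the element width, so the shifted values feed the shufflevector directly;
; the i16 and i64 variants go through bitcasts to <4 x i32> instead.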

define <4 x i32> @shuffle_i32_of_shl_i64(<2 x i64> %x, <2 x i64> %y) nounwind {
; SSE2-LABEL: shuffle_i32_of_shl_i64:
; SSE2:       # %bb.0:
; SSE2-NEXT:    psllq $63, %xmm0
; SSE2-NEXT:    psllq $63, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,2],xmm0[1,0]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    ret{{[l|q]}}
;
; AVX2-LABEL: shuffle_i32_of_shl_i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsllq $63, %xmm0, %xmm0
; AVX2-NEXT:    vpsllq $63, %xmm1, %xmm1
; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[3,2],xmm0[1,0]
; AVX2-NEXT:    ret{{[l|q]}}
  %i1 = tail call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> %x, i32 63)
  %i2 = tail call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> %y, i32 63)
  %i3 = bitcast <2 x i64> %i1 to <4 x i32>
  %i4 = bitcast <2 x i64> %i2 to <4 x i32>
  %i5 = shufflevector <4 x i32> %i3, <4 x i32> %i4, <4 x i32> <i32 7, i32 6, i32 1, i32 0>
  ret <4 x i32> %i5
}
define <4 x i32> @shuffle_i32_of_lshr_i64(<2 x i64> %x, <2 x i64> %y) nounwind {
; SSE2-LABEL: shuffle_i32_of_lshr_i64:
; SSE2:       # %bb.0:
; SSE2-NEXT:    psrlq $63, %xmm0
; SSE2-NEXT:    psrlq $63, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,2],xmm0[1,0]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    ret{{[l|q]}}
;
; AVX2-LABEL: shuffle_i32_of_lshr_i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrlq $63, %xmm0, %xmm0
; AVX2-NEXT:    vpsrlq $63, %xmm1, %xmm1
; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[3,2],xmm0[1,0]
; AVX2-NEXT:    ret{{[l|q]}}
  %i1 = tail call <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64> %x, i32 63)
  %i2 = tail call <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64> %y, i32 63)
  %i3 = bitcast <2 x i64> %i1 to <4 x i32>
  %i4 = bitcast <2 x i64> %i2 to <4 x i32>
  %i5 = shufflevector <4 x i32> %i3, <4 x i32> %i4, <4 x i32> <i32 7, i32 6, i32 1, i32 0>
  ret <4 x i32> %i5
}
define <4 x i32> @shuffle_i32_of_ashr_i64(<2 x i64> %x, <2 x i64> %y) nounwind {
; X64-SSE2-LABEL: shuffle_i32_of_ashr_i64:
; X64-SSE2:       # %bb.0:
; X64-SSE2-NEXT:    subq $40, %rsp
; X64-SSE2-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; X64-SSE2-NEXT:    movl $63, %edi
; X64-SSE2-NEXT:    callq llvm.x86.sse2.psrai.q@PLT
; X64-SSE2-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
; X64-SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; X64-SSE2-NEXT:    movl $63, %edi
; X64-SSE2-NEXT:    callq llvm.x86.sse2.psrai.q@PLT
; X64-SSE2-NEXT:    shufps $27, (%rsp), %xmm0 # 16-byte Folded Reload
; X64-SSE2-NEXT:    # xmm0 = xmm0[3,2],mem[1,0]
; X64-SSE2-NEXT:    addq $40, %rsp
; X64-SSE2-NEXT:    retq
;
; X64-AVX2-LABEL: shuffle_i32_of_ashr_i64:
; X64-AVX2:       # %bb.0:
; X64-AVX2-NEXT:    subq $40, %rsp
; X64-AVX2-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; X64-AVX2-NEXT:    movl $63, %edi
; X64-AVX2-NEXT:    callq llvm.x86.sse2.psrai.q@PLT
; X64-AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; X64-AVX2-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; X64-AVX2-NEXT:    movl $63, %edi
; X64-AVX2-NEXT:    callq llvm.x86.sse2.psrai.q@PLT
; X64-AVX2-NEXT:    vshufps $27, (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
; X64-AVX2-NEXT:    # xmm0 = xmm0[3,2],mem[1,0]
; X64-AVX2-NEXT:    addq $40, %rsp
; X64-AVX2-NEXT:    retq
;
; X86-SSE2-LABEL: shuffle_i32_of_ashr_i64:
; X86-SSE2:       # %bb.0:
; X86-SSE2-NEXT:    subl $32, %esp
; X86-SSE2-NEXT:    movups %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-SSE2-NEXT:    pushl $63
; X86-SSE2-NEXT:    calll llvm.x86.sse2.psrai.q@PLT
; X86-SSE2-NEXT:    addl $4, %esp
; X86-SSE2-NEXT:    movups %xmm0, (%esp) # 16-byte Spill
; X86-SSE2-NEXT:    movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-SSE2-NEXT:    pushl $63
; X86-SSE2-NEXT:    calll llvm.x86.sse2.psrai.q@PLT
; X86-SSE2-NEXT:    addl $4, %esp
; X86-SSE2-NEXT:    movups (%esp), %xmm1 # 16-byte Reload
; X86-SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,2],xmm1[1,0]
; X86-SSE2-NEXT:    addl $32, %esp
; X86-SSE2-NEXT:    retl
;
; X86-AVX2-LABEL: shuffle_i32_of_ashr_i64:
; X86-AVX2:       # %bb.0:
; X86-AVX2-NEXT:    subl $32, %esp
; X86-AVX2-NEXT:    vmovups %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-AVX2-NEXT:    pushl $63
; X86-AVX2-NEXT:    calll llvm.x86.sse2.psrai.q@PLT
; X86-AVX2-NEXT:    addl $4, %esp
; X86-AVX2-NEXT:    vmovups %xmm0, (%esp) # 16-byte Spill
; X86-AVX2-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-AVX2-NEXT:    pushl $63
; X86-AVX2-NEXT:    calll llvm.x86.sse2.psrai.q@PLT
; X86-AVX2-NEXT:    addl $4, %esp
; X86-AVX2-NEXT:    vshufps $27, (%esp), %xmm0, %xmm0 # 16-byte Folded Reload
; X86-AVX2-NEXT:    # xmm0 = xmm0[3,2],mem[1,0]
; X86-AVX2-NEXT:    addl $32, %esp
; X86-AVX2-NEXT:    retl
  %i1 = tail call <2 x i64> @llvm.x86.sse2.psrai.q(<2 x i64> %x, i32 63)
  %i2 = tail call <2 x i64> @llvm.x86.sse2.psrai.q(<2 x i64> %y, i32 63)
  %i3 = bitcast <2 x i64> %i1 to <4 x i32>
  %i4 = bitcast <2 x i64> %i2 to <4 x i32>
  %i5 = shufflevector <4 x i32> %i3, <4 x i32> %i4, <4 x i32> <i32 7, i32 6, i32 1, i32 0>
  ret <4 x i32> %i5
}
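
; Note: @llvm.x86.sse2.psrai.q is not a real intrinsic (see the declaration
; at the bottom of this file); SSE2/AVX2 have no 64-bit arithmetic right
; shift, so the runs above lower the calls as ordinary external calls
; (callq/calll ...@PLT) with the surrounding spill/reload traffic instead of
; emitting a vector shift instruction.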

;------------------------------ 64-bit shuffles -------------------------------;

define <2 x i64> @shuffle_i64_of_shl_i16(<8 x i16> %x, <8 x i16> %y) nounwind {
; SSE2-LABEL: shuffle_i64_of_shl_i16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    psllw $15, %xmm0
; SSE2-NEXT:    psllw $15, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,3],xmm0[0,1]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    ret{{[l|q]}}
;
; AVX2-LABEL: shuffle_i64_of_shl_i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsllw $15, %xmm0, %xmm0
; AVX2-NEXT:    vpsllw $15, %xmm1, %xmm1
; AVX2-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
; AVX2-NEXT:    ret{{[l|q]}}
  %i1 = tail call <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> %x, i32 15)
  %i2 = tail call <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> %y, i32 15)
  %i3 = bitcast <8 x i16> %i1 to <2 x i64>
  %i4 = bitcast <8 x i16> %i2 to <2 x i64>
  %i5 = shufflevector <2 x i64> %i3, <2 x i64> %i4, <2 x i32> <i32 3, i32 0>
  ret <2 x i64> %i5
}
define <2 x i64> @shuffle_i64_of_lshr_i16(<8 x i16> %x, <8 x i16> %y) nounwind {
; SSE2-LABEL: shuffle_i64_of_lshr_i16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    psrlw $15, %xmm0
; SSE2-NEXT:    psrlw $15, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,3],xmm0[0,1]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    ret{{[l|q]}}
;
; AVX2-LABEL: shuffle_i64_of_lshr_i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrlw $15, %xmm0, %xmm0
; AVX2-NEXT:    vpsrlw $15, %xmm1, %xmm1
; AVX2-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
; AVX2-NEXT:    ret{{[l|q]}}
  %i1 = tail call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> %x, i32 15)
  %i2 = tail call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> %y, i32 15)
  %i3 = bitcast <8 x i16> %i1 to <2 x i64>
  %i4 = bitcast <8 x i16> %i2 to <2 x i64>
  %i5 = shufflevector <2 x i64> %i3, <2 x i64> %i4, <2 x i32> <i32 3, i32 0>
  ret <2 x i64> %i5
}
define <2 x i64> @shuffle_i64_of_ashr_i16(<8 x i16> %x, <8 x i16> %y) nounwind {
; SSE2-LABEL: shuffle_i64_of_ashr_i16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    psraw $15, %xmm0
; SSE2-NEXT:    psraw $15, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,3],xmm0[0,1]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    ret{{[l|q]}}
;
; AVX2-LABEL: shuffle_i64_of_ashr_i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsraw $15, %xmm0, %xmm0
; AVX2-NEXT:    vpsraw $15, %xmm1, %xmm1
; AVX2-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
; AVX2-NEXT:    ret{{[l|q]}}
  %i1 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %x, i32 15)
  %i2 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %y, i32 15)
  %i3 = bitcast <8 x i16> %i1 to <2 x i64>
  %i4 = bitcast <8 x i16> %i2 to <2 x i64>
  %i5 = shufflevector <2 x i64> %i3, <2 x i64> %i4, <2 x i32> <i32 3, i32 0>
  ret <2 x i64> %i5
}
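
; For the <2 x i64> <3, 0> mask the SSE2 runs use shufps on the i32 halves,
; while the AVX2 runs select vpalignr, which rotates the byte-wise
; concatenation of the two sources; both orderings produce the same
; <2 x i64> result (element 1 of %y followed by element 0 of %x).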

define <2 x i64> @shuffle_i64_of_shl_i32(<4 x i32> %x, <4 x i32> %y) nounwind {
; SSE2-LABEL: shuffle_i64_of_shl_i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pslld $31, %xmm0
; SSE2-NEXT:    pslld $31, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,3],xmm0[0,1]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    ret{{[l|q]}}
;
; AVX2-LABEL: shuffle_i64_of_shl_i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpslld $31, %xmm0, %xmm0
; AVX2-NEXT:    vpslld $31, %xmm1, %xmm1
; AVX2-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
; AVX2-NEXT:    ret{{[l|q]}}
  %i1 = tail call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %x, i32 31)
  %i2 = tail call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %y, i32 31)
  %i3 = bitcast <4 x i32> %i1 to <2 x i64>
  %i4 = bitcast <4 x i32> %i2 to <2 x i64>
  %i5 = shufflevector <2 x i64> %i3, <2 x i64> %i4, <2 x i32> <i32 3, i32 0>
  ret <2 x i64> %i5
}
define <2 x i64> @shuffle_i64_of_lshr_i32(<4 x i32> %x, <4 x i32> %y) nounwind {
; SSE2-LABEL: shuffle_i64_of_lshr_i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    psrld $31, %xmm0
; SSE2-NEXT:    psrld $31, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,3],xmm0[0,1]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    ret{{[l|q]}}
;
; AVX2-LABEL: shuffle_i64_of_lshr_i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrld $31, %xmm0, %xmm0
; AVX2-NEXT:    vpsrld $31, %xmm1, %xmm1
; AVX2-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
; AVX2-NEXT:    ret{{[l|q]}}
  %i1 = tail call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> %x, i32 31)
  %i2 = tail call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> %y, i32 31)
  %i3 = bitcast <4 x i32> %i1 to <2 x i64>
  %i4 = bitcast <4 x i32> %i2 to <2 x i64>
  %i5 = shufflevector <2 x i64> %i3, <2 x i64> %i4, <2 x i32> <i32 3, i32 0>
  ret <2 x i64> %i5
}
define <2 x i64> @shuffle_i64_of_ashr_i32(<4 x i32> %x, <4 x i32> %y) nounwind {
; SSE2-LABEL: shuffle_i64_of_ashr_i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    psrad $31, %xmm0
; SSE2-NEXT:    psrad $31, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,3],xmm0[0,1]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    ret{{[l|q]}}
;
; AVX2-LABEL: shuffle_i64_of_ashr_i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrad $31, %xmm0, %xmm0
; AVX2-NEXT:    vpsrad $31, %xmm1, %xmm1
; AVX2-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
; AVX2-NEXT:    ret{{[l|q]}}
  %i1 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %x, i32 31)
  %i2 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %y, i32 31)
  %i3 = bitcast <4 x i32> %i1 to <2 x i64>
  %i4 = bitcast <4 x i32> %i2 to <2 x i64>
  %i5 = shufflevector <2 x i64> %i3, <2 x i64> %i4, <2 x i32> <i32 3, i32 0>
  ret <2 x i64> %i5
}

define <2 x i64> @shuffle_i64_of_shl_i64(<2 x i64> %x, <2 x i64> %y) nounwind {
; SSE2-LABEL: shuffle_i64_of_shl_i64:
; SSE2:       # %bb.0:
; SSE2-NEXT:    psllq $63, %xmm0
; SSE2-NEXT:    psllq $63, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,3],xmm0[0,1]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    ret{{[l|q]}}
;
; AVX2-LABEL: shuffle_i64_of_shl_i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsllq $63, %xmm0, %xmm0
; AVX2-NEXT:    vpsllq $63, %xmm1, %xmm1
; AVX2-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
; AVX2-NEXT:    ret{{[l|q]}}
  %i1 = tail call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> %x, i32 63)
  %i2 = tail call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> %y, i32 63)
  %i3 = bitcast <2 x i64> %i1 to <2 x i64>
  %i4 = bitcast <2 x i64> %i2 to <2 x i64>
  %i5 = shufflevector <2 x i64> %i3, <2 x i64> %i4, <2 x i32> <i32 3, i32 0>
  ret <2 x i64> %i5
}
define <2 x i64> @shuffle_i64_of_lshr_i64(<2 x i64> %x, <2 x i64> %y) nounwind {
; SSE2-LABEL: shuffle_i64_of_lshr_i64:
; SSE2:       # %bb.0:
; SSE2-NEXT:    psrlq $63, %xmm0
; SSE2-NEXT:    psrlq $63, %xmm1
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,3],xmm0[0,1]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    ret{{[l|q]}}
;
; AVX2-LABEL: shuffle_i64_of_lshr_i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrlq $63, %xmm0, %xmm0
; AVX2-NEXT:    vpsrlq $63, %xmm1, %xmm1
; AVX2-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
; AVX2-NEXT:    ret{{[l|q]}}
  %i1 = tail call <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64> %x, i32 63)
  %i2 = tail call <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64> %y, i32 63)
  %i3 = bitcast <2 x i64> %i1 to <2 x i64>
  %i4 = bitcast <2 x i64> %i2 to <2 x i64>
  %i5 = shufflevector <2 x i64> %i3, <2 x i64> %i4, <2 x i32> <i32 3, i32 0>
  ret <2 x i64> %i5
}
define <2 x i64> @shuffle_i64_of_ashr_i64(<2 x i64> %x, <2 x i64> %y) nounwind {
; X64-SSE2-LABEL: shuffle_i64_of_ashr_i64:
; X64-SSE2:       # %bb.0:
; X64-SSE2-NEXT:    subq $40, %rsp
; X64-SSE2-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; X64-SSE2-NEXT:    movl $63, %edi
; X64-SSE2-NEXT:    callq llvm.x86.sse2.psrai.q@PLT
; X64-SSE2-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
; X64-SSE2-NEXT:    movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; X64-SSE2-NEXT:    movl $63, %edi
; X64-SSE2-NEXT:    callq llvm.x86.sse2.psrai.q@PLT
; X64-SSE2-NEXT:    shufpd $1, (%rsp), %xmm0 # 16-byte Folded Reload
; X64-SSE2-NEXT:    # xmm0 = xmm0[1],mem[0]
; X64-SSE2-NEXT:    addq $40, %rsp
; X64-SSE2-NEXT:    retq
;
; X64-AVX2-LABEL: shuffle_i64_of_ashr_i64:
; X64-AVX2:       # %bb.0:
; X64-AVX2-NEXT:    subq $40, %rsp
; X64-AVX2-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; X64-AVX2-NEXT:    movl $63, %edi
; X64-AVX2-NEXT:    callq llvm.x86.sse2.psrai.q@PLT
; X64-AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; X64-AVX2-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; X64-AVX2-NEXT:    movl $63, %edi
; X64-AVX2-NEXT:    callq llvm.x86.sse2.psrai.q@PLT
; X64-AVX2-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
; X64-AVX2-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
; X64-AVX2-NEXT:    addq $40, %rsp
; X64-AVX2-NEXT:    retq
;
; X86-SSE2-LABEL: shuffle_i64_of_ashr_i64:
; X86-SSE2:       # %bb.0:
; X86-SSE2-NEXT:    subl $32, %esp
; X86-SSE2-NEXT:    movups %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-SSE2-NEXT:    pushl $63
; X86-SSE2-NEXT:    calll llvm.x86.sse2.psrai.q@PLT
; X86-SSE2-NEXT:    addl $4, %esp
; X86-SSE2-NEXT:    movups %xmm0, (%esp) # 16-byte Spill
; X86-SSE2-NEXT:    movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-SSE2-NEXT:    pushl $63
; X86-SSE2-NEXT:    calll llvm.x86.sse2.psrai.q@PLT
; X86-SSE2-NEXT:    addl $4, %esp
; X86-SSE2-NEXT:    movups (%esp), %xmm1 # 16-byte Reload
; X86-SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,3],xmm1[0,1]
; X86-SSE2-NEXT:    addl $32, %esp
; X86-SSE2-NEXT:    retl
;
; X86-AVX2-LABEL: shuffle_i64_of_ashr_i64:
; X86-AVX2:       # %bb.0:
; X86-AVX2-NEXT:    subl $32, %esp
; X86-AVX2-NEXT:    vmovups %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-AVX2-NEXT:    pushl $63
; X86-AVX2-NEXT:    calll llvm.x86.sse2.psrai.q@PLT
; X86-AVX2-NEXT:    addl $4, %esp
; X86-AVX2-NEXT:    vmovups %xmm0, (%esp) # 16-byte Spill
; X86-AVX2-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-AVX2-NEXT:    pushl $63
; X86-AVX2-NEXT:    calll llvm.x86.sse2.psrai.q@PLT
; X86-AVX2-NEXT:    addl $4, %esp
; X86-AVX2-NEXT:    vmovdqu (%esp), %xmm1 # 16-byte Reload
; X86-AVX2-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
; X86-AVX2-NEXT:    addl $32, %esp
; X86-AVX2-NEXT:    retl
  %i1 = tail call <2 x i64> @llvm.x86.sse2.psrai.q(<2 x i64> %x, i32 63)
  %i2 = tail call <2 x i64> @llvm.x86.sse2.psrai.q(<2 x i64> %y, i32 63)
  %i3 = bitcast <2 x i64> %i1 to <2 x i64>
  %i4 = bitcast <2 x i64> %i2 to <2 x i64>
  %i5 = shufflevector <2 x i64> %i3, <2 x i64> %i4, <2 x i32> <i32 3, i32 0>
  ret <2 x i64> %i5
}

declare <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16>, i32)
declare <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16>, i32)
declare <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16>, i32)
declare <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32>, i32)
declare <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32>, i32)
declare <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32>, i32)
declare <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64>, i32)
declare <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64>, i32)
declare <2 x i64> @llvm.x86.sse2.psrai.q(<2 x i64>, i32) ; does not exist
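
; A hypothetical plain-IR version of the ashr_i64 tests (a sketch, not part
; of the checked output) would avoid the fake intrinsic entirely:
;   %s = ashr <2 x i64> %x, <i64 63, i64 63>
; which llc lowers inline rather than as a call; these tests keep the
; intrinsic form so the shifted operands stay opaque to IR-level folds.
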
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; CHECK: {{.*}}
; X64: {{.*}}
; X86: {{.*}}