; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2,AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX,AVX2,AVX2-FAST-ALL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=CHECK,AVX,AVX2,AVX2-FAST-PERLANE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,AVX,AVX512

; fold (sra 0, x) -> 0
define <4 x i32> @combine_vec_ashr_zero(<4 x i32> %x) {
; SSE-LABEL: combine_vec_ashr_zero:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_ashr_zero:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = ashr <4 x i32> zeroinitializer, %x
  ret <4 x i32> %1
}

; fold (sra -1, x) -> -1
define <4 x i32> @combine_vec_ashr_allones(<4 x i32> %x) {
; SSE-LABEL: combine_vec_ashr_allones:
; SSE:       # %bb.0:
; SSE-NEXT:    pcmpeqd %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_ashr_allones:
; AVX:       # %bb.0:
; AVX-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = ashr <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, %x
  ret <4 x i32> %1
}

; fold (sra x, c >= size(x)) -> undef
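; (shifting an i32 element by 32 or more bits yields poison, so the backend
; is free to return %x unchanged, which is why the checks below are just a
; bare retq)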
define <4 x i32> @combine_vec_ashr_outofrange0(<4 x i32> %x) {
; CHECK-LABEL: combine_vec_ashr_outofrange0:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  %1 = ashr <4 x i32> %x, <i32 33, i32 33, i32 33, i32 33>
  ret <4 x i32> %1
}

define <4 x i32> @combine_vec_ashr_outofrange1(<4 x i32> %x) {
; CHECK-LABEL: combine_vec_ashr_outofrange1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  %1 = ashr <4 x i32> %x, <i32 33, i32 34, i32 35, i32 36>
  ret <4 x i32> %1
}

define <4 x i32> @combine_vec_ashr_outofrange2(<4 x i32> %x) {
; CHECK-LABEL: combine_vec_ashr_outofrange2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  %1 = ashr <4 x i32> %x, <i32 33, i32 34, i32 35, i32 undef>
  ret <4 x i32> %1
}

; fold (sra x, 0) -> x
define <4 x i32> @combine_vec_ashr_by_zero(<4 x i32> %x) {
; CHECK-LABEL: combine_vec_ashr_by_zero:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  %1 = ashr <4 x i32> %x, zeroinitializer
  ret <4 x i32> %1
}

; fold (sra (sra x, c1), c2) -> (sra x, (add c1, c2))
define <4 x i32> @combine_vec_ashr_ashr0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_ashr_ashr0:
; SSE:       # %bb.0:
; SSE-NEXT:    psrad $6, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_ashr_ashr0:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsrad $6, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = ashr <4 x i32> %x, <i32 2, i32 2, i32 2, i32 2>
  %2 = ashr <4 x i32> %1, <i32 4, i32 4, i32 4, i32 4>
  ret <4 x i32> %2
}

define <4 x i32> @combine_vec_ashr_ashr1(<4 x i32> %x) {
; SSE2-LABEL: combine_vec_ashr_ashr1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrad $10, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    psrad $8, %xmm2
; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrad $6, %xmm1
; SSE2-NEXT:    psrad $4, %xmm0
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,3],xmm2[0,3]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_ashr_ashr1:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psrad $10, %xmm1
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    psrad $6, %xmm2
; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psrad $8, %xmm1
; SSE41-NEXT:    psrad $4, %xmm0
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_vec_ashr_ashr1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = ashr <4 x i32> %x, <i32 0, i32 1, i32 2, i32 3>
  %2 = ashr <4 x i32> %1, <i32 4, i32 5, i32 6, i32 7>
  ret <4 x i32> %2
}

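; The summed per-lane shift amounts all reach or exceed the 32-bit element
; width, so the combined arithmetic shift saturates to a shift by 31, i.e. a
; splat of each lane's sign bit (the psrad/vpsrad $31 checked below).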
define <4 x i32> @combine_vec_ashr_ashr2(<4 x i32> %x) {
; SSE-LABEL: combine_vec_ashr_ashr2:
; SSE:       # %bb.0:
; SSE-NEXT:    psrad $31, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_ashr_ashr2:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsrad $31, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = ashr <4 x i32> %x, <i32 17, i32 18, i32 19, i32 20>
  %2 = ashr <4 x i32> %1, <i32 25, i32 26, i32 27, i32 28>
  ret <4 x i32> %2
}

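; Some lanes here use an out-of-range shift amount (50 in the first shift,
; 33 in the second), which makes those lanes poison; only lanes 1 and 3
; (summed shifts of 15 and 27) constrain the result, so the pair still
; folds to a single per-lane shift.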
define <4 x i32> @combine_vec_ashr_ashr3(<4 x i32> %x) {
; SSE2-LABEL: combine_vec_ashr_ashr3:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    psrad $27, %xmm2
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrad $31, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm3
; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1]
; SSE2-NEXT:    psrad $15, %xmm0
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,3],xmm3[0,3]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_ashr_ashr3:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psrad $27, %xmm1
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    psrad $15, %xmm2
; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT:    psrad $31, %xmm0
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_vec_ashr_ashr3:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = ashr <4 x i32> %x, <i32  1, i32  5, i32 50, i32 27>
  %2 = ashr <4 x i32> %1, <i32 33, i32 10, i32 33, i32  0>
  ret <4 x i32> %2
}

; fold (sra x, (trunc (and y, c))) -> (sra x, (and (trunc y), (trunc c))).
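; (pushing the truncate through the AND keeps the masking in the narrower
; <4 x i32> domain, so the AVX lowerings below truncate %y first and then
; apply the mask and vpsravd on a single XMM register)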
define <4 x i32> @combine_vec_ashr_trunc_and(<4 x i32> %x, <4 x i64> %y) {
; SSE2-LABEL: combine_vec_ashr_trunc_and:
; SSE2:       # %bb.0:
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
; SSE2-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
; SSE2-NEXT:    movdqa %xmm0, %xmm3
; SSE2-NEXT:    psrad %xmm2, %xmm3
; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm1[0,1,1,1,4,5,6,7]
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    psrad %xmm4, %xmm2
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7]
; SSE2-NEXT:    movdqa %xmm0, %xmm4
; SSE2-NEXT:    psrad %xmm3, %xmm4
; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
; SSE2-NEXT:    psrad %xmm1, %xmm0
; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1]
; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,3]
; SSE2-NEXT:    movaps %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_ashr_trunc_and:
; SSE41:       # %bb.0:
; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
; SSE41-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
; SSE41-NEXT:    movdqa %xmm0, %xmm3
; SSE41-NEXT:    psrad %xmm2, %xmm3
; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; SSE41-NEXT:    pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7]
; SSE41-NEXT:    movdqa %xmm0, %xmm5
; SSE41-NEXT:    psrad %xmm4, %xmm5
; SSE41-NEXT:    pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7]
; SSE41-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
; SSE41-NEXT:    movdqa %xmm0, %xmm3
; SSE41-NEXT:    psrad %xmm1, %xmm3
; SSE41-NEXT:    pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7]
; SSE41-NEXT:    psrad %xmm1, %xmm0
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7]
; SSE41-NEXT:    retq
;
; AVX2-SLOW-LABEL: combine_vec_ashr_trunc_and:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
; AVX2-SLOW-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX2-SLOW-NEXT:    vpsravd %xmm1, %xmm0, %xmm0
; AVX2-SLOW-NEXT:    vzeroupper
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-ALL-LABEL: combine_vec_ashr_trunc_and:
; AVX2-FAST-ALL:       # %bb.0:
; AVX2-FAST-ALL-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [0,2,4,6,0,0,0,0]
; AVX2-FAST-ALL-NEXT:    vpermd %ymm1, %ymm2, %ymm1
; AVX2-FAST-ALL-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX2-FAST-ALL-NEXT:    vpsravd %xmm1, %xmm0, %xmm0
; AVX2-FAST-ALL-NEXT:    vzeroupper
; AVX2-FAST-ALL-NEXT:    retq
;
; AVX2-FAST-PERLANE-LABEL: combine_vec_ashr_trunc_and:
; AVX2-FAST-PERLANE:       # %bb.0:
; AVX2-FAST-PERLANE-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX2-FAST-PERLANE-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
; AVX2-FAST-PERLANE-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX2-FAST-PERLANE-NEXT:    vpsravd %xmm1, %xmm0, %xmm0
; AVX2-FAST-PERLANE-NEXT:    vzeroupper
; AVX2-FAST-PERLANE-NEXT:    retq
;
; AVX512-LABEL: combine_vec_ashr_trunc_and:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpmovqd %ymm1, %xmm1
; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512-NEXT:    vpsravd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = and <4 x i64> %y, <i64 15, i64 255, i64 4095, i64 65535>
  %2 = trunc <4 x i64> %1 to <4 x i32>
  %3 = ashr <4 x i32> %x, %2
  ret <4 x i32> %3
}

; fold (sra (trunc (srl x, c1)), c2) -> (trunc (sra x, c1 + c2))
;      if c1 is equal to the number of bits the trunc removes
define <4 x i32> @combine_vec_ashr_trunc_lshr(<4 x i64> %x) {
; SSE2-LABEL: combine_vec_ashr_trunc_lshr:
; SSE2:       # %bb.0:
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    psrad $3, %xmm1
; SSE2-NEXT:    movaps %xmm0, %xmm2
; SSE2-NEXT:    psrad $2, %xmm2
; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    psrad $1, %xmm1
; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,3],xmm2[0,3]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_ashr_trunc_lshr:
; SSE41:       # %bb.0:
; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; SSE41-NEXT:    movaps %xmm0, %xmm2
; SSE41-NEXT:    psrad $2, %xmm2
; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm2[4,5,6,7]
; SSE41-NEXT:    psrad $1, %xmm0
; SSE41-NEXT:    psrad $3, %xmm1
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
; SSE41-NEXT:    retq
;
; AVX2-SLOW-LABEL: combine_vec_ashr_trunc_lshr:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; AVX2-SLOW-NEXT:    vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-SLOW-NEXT:    vzeroupper
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-ALL-LABEL: combine_vec_ashr_trunc_lshr:
; AVX2-FAST-ALL:       # %bb.0:
; AVX2-FAST-ALL-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [1,3,5,7]
; AVX2-FAST-ALL-NEXT:    vpermd %ymm0, %ymm1, %ymm0
; AVX2-FAST-ALL-NEXT:    vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-FAST-ALL-NEXT:    vzeroupper
; AVX2-FAST-ALL-NEXT:    retq
;
; AVX2-FAST-PERLANE-LABEL: combine_vec_ashr_trunc_lshr:
; AVX2-FAST-PERLANE:       # %bb.0:
; AVX2-FAST-PERLANE-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX2-FAST-PERLANE-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; AVX2-FAST-PERLANE-NEXT:    vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-FAST-PERLANE-NEXT:    vzeroupper
; AVX2-FAST-PERLANE-NEXT:    retq
;
; AVX512-LABEL: combine_vec_ashr_trunc_lshr:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsrlq $32, %ymm0, %ymm0
; AVX512-NEXT:    vpmovqd %ymm0, %xmm0
; AVX512-NEXT:    vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = lshr <4 x i64> %x, <i64 32, i64 32, i64 32, i64 32>
  %2 = trunc <4 x i64> %1 to <4 x i32>
  %3 = ashr <4 x i32> %2, <i32 0, i32 1, i32 2, i32 3>
  ret <4 x i32> %3
}

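; Splat case: the logical shift right by 24 and the 2-bit arithmetic shift
; applied after the truncation to i8 combine into a single arithmetic shift
; by 26, performed before the narrowing packs (the psrad/vpsrad $26 below).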
define <16 x i8> @combine_vec_ashr_trunc_lshr_splat(<16 x i32> %x) {
; SSE-LABEL: combine_vec_ashr_trunc_lshr_splat:
; SSE:       # %bb.0:
; SSE-NEXT:    psrad $26, %xmm3
; SSE-NEXT:    psrad $26, %xmm2
; SSE-NEXT:    packssdw %xmm3, %xmm2
; SSE-NEXT:    psrad $26, %xmm1
; SSE-NEXT:    psrad $26, %xmm0
; SSE-NEXT:    packssdw %xmm1, %xmm0
; SSE-NEXT:    packsswb %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX2-LABEL: combine_vec_ashr_trunc_lshr_splat:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrad $26, %ymm1, %ymm1
; AVX2-NEXT:    vpsrad $26, %ymm0, %ymm0
; AVX2-NEXT:    vpackssdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: combine_vec_ashr_trunc_lshr_splat:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsrad $26, %zmm0, %zmm0
; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = lshr <16 x i32> %x, <i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24>
  %2 = trunc <16 x i32> %1 to <16 x i8>
  %3 = ashr <16 x i8> %2, <i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2>
  ret <16 x i8> %3
}

; fold (sra (trunc (sra x, c1)), c2) -> (trunc (sra x, c1 + c2))
;      if c1 is equal to the number of bits the trunc removes
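; (the AVX512 lowering below is even free to shift the i64 elements
; logically with vpsrlq rather than arithmetically, since the truncation
; keeps only the low 32 bits, which are the same either way)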
define <4 x i32> @combine_vec_ashr_trunc_ashr(<4 x i64> %x) {
; SSE2-LABEL: combine_vec_ashr_trunc_ashr:
; SSE2:       # %bb.0:
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    psrad $3, %xmm1
; SSE2-NEXT:    movaps %xmm0, %xmm2
; SSE2-NEXT:    psrad $2, %xmm2
; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    psrad $1, %xmm1
; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,3],xmm2[0,3]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_ashr_trunc_ashr:
; SSE41:       # %bb.0:
; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; SSE41-NEXT:    movaps %xmm0, %xmm2
; SSE41-NEXT:    psrad $2, %xmm2
; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm0[0,1,2,3],xmm2[4,5,6,7]
; SSE41-NEXT:    psrad $1, %xmm0
; SSE41-NEXT:    psrad $3, %xmm1
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3],xmm2[4,5],xmm0[6,7]
; SSE41-NEXT:    retq
;
; AVX2-SLOW-LABEL: combine_vec_ashr_trunc_ashr:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX2-SLOW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; AVX2-SLOW-NEXT:    vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-SLOW-NEXT:    vzeroupper
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-ALL-LABEL: combine_vec_ashr_trunc_ashr:
; AVX2-FAST-ALL:       # %bb.0:
; AVX2-FAST-ALL-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [1,3,5,7]
; AVX2-FAST-ALL-NEXT:    vpermd %ymm0, %ymm1, %ymm0
; AVX2-FAST-ALL-NEXT:    vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-FAST-ALL-NEXT:    vzeroupper
; AVX2-FAST-ALL-NEXT:    retq
;
; AVX2-FAST-PERLANE-LABEL: combine_vec_ashr_trunc_ashr:
; AVX2-FAST-PERLANE:       # %bb.0:
; AVX2-FAST-PERLANE-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX2-FAST-PERLANE-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; AVX2-FAST-PERLANE-NEXT:    vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-FAST-PERLANE-NEXT:    vzeroupper
; AVX2-FAST-PERLANE-NEXT:    retq
;
; AVX512-LABEL: combine_vec_ashr_trunc_ashr:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsrlq $32, %ymm0, %ymm0
; AVX512-NEXT:    vpmovqd %ymm0, %xmm0
; AVX512-NEXT:    vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = ashr <4 x i64> %x, <i64 32, i64 32, i64 32, i64 32>
  %2 = trunc <4 x i64> %1 to <4 x i32>
  %3 = ashr <4 x i32> %2, <i32 0, i32 1, i32 2, i32 3>
  ret <4 x i32> %3
}

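; Splat variant: the 16-bit arithmetic shift plus the 3-bit shift applied
; after the truncation combine into a single arithmetic shift by 19 ahead
; of the pack (the psrad/vpsrad $19 below).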
define <8 x i16> @combine_vec_ashr_trunc_ashr_splat(<8 x i32> %x) {
; SSE-LABEL: combine_vec_ashr_trunc_ashr_splat:
; SSE:       # %bb.0:
; SSE-NEXT:    psrad $19, %xmm1
; SSE-NEXT:    psrad $19, %xmm0
; SSE-NEXT:    packssdw %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX2-LABEL: combine_vec_ashr_trunc_ashr_splat:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrad $19, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: combine_vec_ashr_trunc_ashr_splat:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsrad $19, %ymm0, %ymm0
; AVX512-NEXT:    vpmovdw %ymm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %1 = ashr <8 x i32> %x, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  %2 = trunc <8 x i32> %1 to <8 x i16>
  %3 = ashr <8 x i16> %2, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
  ret <8 x i16> %3
}

; If the sign bit is known to be zero, switch this to a SRL.
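; (the AND mask clears bit 31 in every lane, so ashr and lshr agree; the
; AVX lowering below uses the variable logical shift vpsrlvd, while the
; SSE versions still have to expand the per-lane variable shift)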
define <4 x i32> @combine_vec_ashr_positive(<4 x i32> %x, <4 x i32> %y) {
; SSE2-LABEL: combine_vec_ashr_positive:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
; SSE2-NEXT:    movdqa %xmm0, %xmm3
; SSE2-NEXT:    psrld %xmm2, %xmm3
; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm1[0,1,1,1,4,5,6,7]
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    psrld %xmm4, %xmm2
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7]
; SSE2-NEXT:    movdqa %xmm0, %xmm4
; SSE2-NEXT:    psrld %xmm3, %xmm4
; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
; SSE2-NEXT:    psrld %xmm1, %xmm0
; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1]
; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,3]
; SSE2-NEXT:    movaps %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec_ashr_positive:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
; SSE41-NEXT:    movdqa %xmm0, %xmm3
; SSE41-NEXT:    psrld %xmm2, %xmm3
; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; SSE41-NEXT:    pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7]
; SSE41-NEXT:    movdqa %xmm0, %xmm5
; SSE41-NEXT:    psrld %xmm4, %xmm5
; SSE41-NEXT:    pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7]
; SSE41-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
; SSE41-NEXT:    movdqa %xmm0, %xmm3
; SSE41-NEXT:    psrld %xmm1, %xmm3
; SSE41-NEXT:    pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7]
; SSE41-NEXT:    psrld %xmm1, %xmm0
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_vec_ashr_positive:
; AVX:       # %bb.0:
; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = and <4 x i32> %x, <i32 15, i32 255, i32 4095, i32 65535>
  %2 = ashr <4 x i32> %1, %y
  ret <4 x i32> %2
}

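; With only the low 10 bits possibly set, shifting right by 10 always
; produces zero, so the whole function folds to an all-zero vector.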
define <4 x i32> @combine_vec_ashr_positive_splat(<4 x i32> %x, <4 x i32> %y) {
; SSE-LABEL: combine_vec_ashr_positive_splat:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: combine_vec_ashr_positive_splat:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = and <4 x i32> %x, <i32 1023, i32 1023, i32 1023, i32 1023>
  %2 = ashr <4 x i32> %1, <i32 10, i32 10, i32 10, i32 10>
  ret <4 x i32> %2
}

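; Clamping the shift amount with umin to the element width minus one is
; redundant on targets whose variable arithmetic shifts already saturate
; out-of-range amounts to a sign splat, so with AVX512BW this becomes a
; bare vpsravw.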
define <8 x i16> @combine_vec8i16_ashr_clamped(<8 x i16> %x, <8 x i16> %y) {
; SSE2-LABEL: combine_vec8i16_ashr_clamped:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    psubusw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE2-NEXT:    psubw %xmm2, %xmm1
; SSE2-NEXT:    psllw $12, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    psraw $15, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm3
; SSE2-NEXT:    pandn %xmm0, %xmm3
; SSE2-NEXT:    psraw $8, %xmm0
; SSE2-NEXT:    pand %xmm2, %xmm0
; SSE2-NEXT:    por %xmm3, %xmm0
; SSE2-NEXT:    paddw %xmm1, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    psraw $15, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm3
; SSE2-NEXT:    pandn %xmm0, %xmm3
; SSE2-NEXT:    psraw $4, %xmm0
; SSE2-NEXT:    pand %xmm2, %xmm0
; SSE2-NEXT:    por %xmm3, %xmm0
; SSE2-NEXT:    paddw %xmm1, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    psraw $15, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm3
; SSE2-NEXT:    pandn %xmm0, %xmm3
; SSE2-NEXT:    psraw $2, %xmm0
; SSE2-NEXT:    pand %xmm2, %xmm0
; SSE2-NEXT:    por %xmm3, %xmm0
; SSE2-NEXT:    paddw %xmm1, %xmm1
; SSE2-NEXT:    psraw $15, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    pandn %xmm0, %xmm2
; SSE2-NEXT:    psraw $1, %xmm0
; SSE2-NEXT:    pand %xmm1, %xmm0
; SSE2-NEXT:    por %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec8i16_ashr_clamped:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    pminuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    psllw $12, %xmm0
; SSE41-NEXT:    psllw $4, %xmm1
; SSE41-NEXT:    por %xmm1, %xmm0
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    paddw %xmm0, %xmm1
; SSE41-NEXT:    movdqa %xmm2, %xmm3
; SSE41-NEXT:    psraw $8, %xmm3
; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
; SSE41-NEXT:    movdqa %xmm2, %xmm3
; SSE41-NEXT:    psraw $4, %xmm3
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
; SSE41-NEXT:    movdqa %xmm2, %xmm3
; SSE41-NEXT:    psraw $2, %xmm3
; SSE41-NEXT:    paddw %xmm1, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
; SSE41-NEXT:    movdqa %xmm2, %xmm3
; SSE41-NEXT:    psraw $1, %xmm3
; SSE41-NEXT:    paddw %xmm1, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
; SSE41-NEXT:    movdqa %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX2-LABEL: combine_vec8i16_ashr_clamped:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpminuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX2-NEXT:    vpsravd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512-LABEL: combine_vec8i16_ashr_clamped:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsravw %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %y, <8 x i16> <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>)
  %2 = ashr <8 x i16> %x, %1
  ret <8 x i16> %2
}

define <4 x i32> @combine_vec4i32_ashr_clamped(<4 x i32> %x, <4 x i32> %y) {
; SSE2-LABEL: combine_vec4i32_ashr_clamped:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; SSE2-NEXT:    pxor %xmm1, %xmm2
; SSE2-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm3
; SSE2-NEXT:    pandn %xmm1, %xmm3
; SSE2-NEXT:    psrld $27, %xmm2
; SSE2-NEXT:    por %xmm3, %xmm2
; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm2[2,3,3,3,4,5,6,7]
; SSE2-NEXT:    movdqa %xmm0, %xmm3
; SSE2-NEXT:    psrad %xmm1, %xmm3
; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm2[0,1,1,1,4,5,6,7]
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psrad %xmm4, %xmm1
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm2[2,3,3,3,4,5,6,7]
; SSE2-NEXT:    movdqa %xmm0, %xmm4
; SSE2-NEXT:    psrad %xmm3, %xmm4
; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,1,4,5,6,7]
; SSE2-NEXT:    psrad %xmm2, %xmm0
; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1]
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,3],xmm0[0,3]
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec4i32_ashr_clamped:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pminud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
; SSE41-NEXT:    movdqa %xmm0, %xmm3
; SSE41-NEXT:    psrad %xmm2, %xmm3
; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; SSE41-NEXT:    pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7]
; SSE41-NEXT:    movdqa %xmm0, %xmm5
; SSE41-NEXT:    psrad %xmm4, %xmm5
; SSE41-NEXT:    pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7]
; SSE41-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
; SSE41-NEXT:    movdqa %xmm0, %xmm3
; SSE41-NEXT:    psrad %xmm1, %xmm3
; SSE41-NEXT:    pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7]
; SSE41-NEXT:    psrad %xmm1, %xmm0
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7]
; SSE41-NEXT:    retq
;
; AVX-LABEL: combine_vec4i32_ashr_clamped:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsravd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = tail call <4 x i32> @llvm.umin.v4i32(<4 x i32> %y, <4 x i32> <i32 31, i32 31, i32 31, i32 31>)
  %2 = ashr <4 x i32> %x, %1
  ret <4 x i32> %2
}

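; The 64-bit version of the same pattern: AVX512VL has a native vpsravq so
; the clamp disappears entirely, while the older targets first materialize
; the clamped shift amounts and then emulate the arithmetic shift with a
; logical shift plus an xor/sub against a shifted sign-bit mask.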
define <4 x i64> @combine_vec4i64_ashr_clamped(<4 x i64> %x, <4 x i64> %y) {
; SSE2-LABEL: combine_vec4i64_ashr_clamped:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [9223372039002259456,9223372039002259456]
; SSE2-NEXT:    movdqa %xmm3, %xmm4
; SSE2-NEXT:    pxor %xmm5, %xmm4
; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[0,0,2,2]
; SSE2-NEXT:    movdqa {{.*#+}} xmm7 = [2147483711,2147483711,2147483711,2147483711]
; SSE2-NEXT:    movdqa %xmm7, %xmm8
; SSE2-NEXT:    pcmpgtd %xmm6, %xmm8
; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
; SSE2-NEXT:    pcmpeqd %xmm5, %xmm4
; SSE2-NEXT:    pand %xmm8, %xmm4
; SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [63,63]
; SSE2-NEXT:    pand %xmm4, %xmm3
; SSE2-NEXT:    pandn %xmm6, %xmm4
; SSE2-NEXT:    por %xmm3, %xmm4
; SSE2-NEXT:    movdqa %xmm2, %xmm3
; SSE2-NEXT:    pxor %xmm5, %xmm3
; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm3[0,0,2,2]
; SSE2-NEXT:    pcmpgtd %xmm8, %xmm7
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
; SSE2-NEXT:    pcmpeqd %xmm5, %xmm3
; SSE2-NEXT:    pand %xmm7, %xmm3
; SSE2-NEXT:    pand %xmm3, %xmm2
; SSE2-NEXT:    pandn %xmm6, %xmm3
; SSE2-NEXT:    por %xmm2, %xmm3
; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; SSE2-NEXT:    movdqa %xmm2, %xmm5
; SSE2-NEXT:    psrlq %xmm3, %xmm5
; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm3[2,3,2,3]
; SSE2-NEXT:    movdqa %xmm2, %xmm7
; SSE2-NEXT:    psrlq %xmm6, %xmm7
; SSE2-NEXT:    movsd {{.*#+}} xmm7 = xmm5[0],xmm7[1]
; SSE2-NEXT:    movdqa %xmm0, %xmm5
; SSE2-NEXT:    psrlq %xmm3, %xmm5
; SSE2-NEXT:    psrlq %xmm6, %xmm0
; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm5[0],xmm0[1]
; SSE2-NEXT:    xorpd %xmm7, %xmm0
; SSE2-NEXT:    psubq %xmm7, %xmm0
; SSE2-NEXT:    movdqa %xmm2, %xmm3
; SSE2-NEXT:    psrlq %xmm4, %xmm3
; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[2,3,2,3]
; SSE2-NEXT:    psrlq %xmm5, %xmm2
; SSE2-NEXT:    movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1]
; SSE2-NEXT:    movdqa %xmm1, %xmm3
; SSE2-NEXT:    psrlq %xmm4, %xmm3
; SSE2-NEXT:    psrlq %xmm5, %xmm1
; SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1]
; SSE2-NEXT:    xorpd %xmm2, %xmm1
; SSE2-NEXT:    psubq %xmm2, %xmm1
; SSE2-NEXT:    retq
;
; SSE41-LABEL: combine_vec4i64_ashr_clamped:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm4
; SSE41-NEXT:    movdqa {{.*#+}} xmm7 = [9223372039002259456,9223372039002259456]
; SSE41-NEXT:    movdqa %xmm3, %xmm6
; SSE41-NEXT:    pxor %xmm7, %xmm6
; SSE41-NEXT:    movdqa {{.*#+}} xmm8 = [9223372039002259519,9223372039002259519]
; SSE41-NEXT:    pshufd {{.*#+}} xmm9 = xmm6[0,0,2,2]
; SSE41-NEXT:    pcmpeqd %xmm8, %xmm6
; SSE41-NEXT:    movdqa {{.*#+}} xmm5 = [2147483711,2147483711,2147483711,2147483711]
; SSE41-NEXT:    movdqa %xmm5, %xmm0
; SSE41-NEXT:    pcmpgtd %xmm9, %xmm0
; SSE41-NEXT:    pand %xmm6, %xmm0
; SSE41-NEXT:    movapd {{.*#+}} xmm9 = [63,63]
; SSE41-NEXT:    movapd %xmm9, %xmm6
; SSE41-NEXT:    blendvpd %xmm0, %xmm3, %xmm6
; SSE41-NEXT:    pxor %xmm2, %xmm7
; SSE41-NEXT:    pcmpeqd %xmm7, %xmm8
; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[0,0,2,2]
; SSE41-NEXT:    pcmpgtd %xmm0, %xmm5
; SSE41-NEXT:    pand %xmm8, %xmm5
; SSE41-NEXT:    movdqa %xmm5, %xmm0
; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm9
; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [9223372036854775808,9223372036854775808]
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    psrlq %xmm9, %xmm2
; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm9[2,3,2,3]
; SSE41-NEXT:    movdqa %xmm0, %xmm5
; SSE41-NEXT:    psrlq %xmm3, %xmm5
; SSE41-NEXT:    pblendw {{.*#+}} xmm5 = xmm2[0,1,2,3],xmm5[4,5,6,7]
; SSE41-NEXT:    movdqa %xmm4, %xmm2
; SSE41-NEXT:    psrlq %xmm9, %xmm2
; SSE41-NEXT:    psrlq %xmm3, %xmm4
; SSE41-NEXT:    pblendw {{.*#+}} xmm4 = xmm2[0,1,2,3],xmm4[4,5,6,7]
; SSE41-NEXT:    pxor %xmm5, %xmm4
; SSE41-NEXT:    psubq %xmm5, %xmm4
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    psrlq %xmm6, %xmm2
; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm6[2,3,2,3]
; SSE41-NEXT:    psrlq %xmm3, %xmm0
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT:    movdqa %xmm1, %xmm2
; SSE41-NEXT:    psrlq %xmm6, %xmm2
; SSE41-NEXT:    psrlq %xmm3, %xmm1
; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT:    pxor %xmm0, %xmm1
; SSE41-NEXT:    psubq %xmm0, %xmm1
; SSE41-NEXT:    movdqa %xmm4, %xmm0
; SSE41-NEXT:    retq
;
; AVX2-LABEL: combine_vec4i64_ashr_clamped:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
; AVX2-NEXT:    vpxor %ymm2, %ymm1, %ymm3
; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm4 = [9223372036854775870,9223372036854775870,9223372036854775870,9223372036854775870]
; AVX2-NEXT:    vpcmpgtq %ymm4, %ymm3, %ymm3
; AVX2-NEXT:    vbroadcastsd {{.*#+}} ymm4 = [63,63,63,63]
; AVX2-NEXT:    vblendvpd %ymm3, %ymm4, %ymm1, %ymm1
; AVX2-NEXT:    vpsrlvq %ymm1, %ymm2, %ymm2
; AVX2-NEXT:    vpsrlvq %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpxor %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpsubq %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: combine_vec4i64_ashr_clamped:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsravq %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    retq
  %1 = tail call <4 x i64> @llvm.umin.v4i64(<4 x i64> %y, <4 x i64> <i64 63, i64 63, i64 63, i64 63>)
  %2 = ashr <4 x i64> %x, %1
  ret <4 x i64> %2
}