; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=CHECK,X86
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=CHECK,X64

; Check that under certain conditions we can factor out a rotate
; from the following idioms:
;   (a*c0) >> s1 | (a*c1)
;   (a/c0) << s1 | (a/c1)
; This targets cases where instcombine has folded a shl/srl/mul/udiv
; with one of the shifts from the rotate idiom

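; i << 10 == (i << 3) << 7 and 25 + 7 == 32, so only the shl by 3 is kept and
; the remaining shifts fold into rotl(i << 3, 7).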
define <4 x i32> @vroll_v4i32_extract_shl(<4 x i32> %i) {
; CHECK-LABEL: vroll_v4i32_extract_shl:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpslld $3, %xmm0, %xmm0
; CHECK-NEXT:    vprold $7, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %lhs_mul = shl <4 x i32> %i, <i32 3, i32 3, i32 3, i32 3>
  %rhs_mul = shl <4 x i32> %i, <i32 10, i32 10, i32 10, i32 10>
  %lhs_shift = lshr <4 x i32> %lhs_mul, <i32 25, i32 25, i32 25, i32 25>
  %out = or <4 x i32> %lhs_shift, %rhs_mul
  ret <4 x i32> %out
}

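; i >> 40 == (i >> 5) >> 35 and 35 + 29 == 64, so this is rotl(i >> 5, 29).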
define <4 x i64> @vrolq_v4i64_extract_shrl(<4 x i64> %i) nounwind {
; CHECK-LABEL: vrolq_v4i64_extract_shrl:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrlq $5, %ymm0, %ymm0
; CHECK-NEXT:    vprolq $29, %ymm0, %ymm0
; CHECK-NEXT:    ret{{[l|q]}}
  %lhs_div = lshr <4 x i64> %i, <i64 40, i64 40, i64 40, i64 40>
  %rhs_div = lshr <4 x i64> %i, <i64 5, i64 5, i64 5, i64 5>
  %rhs_shift = shl <4 x i64> %rhs_div, <i64 29, i64 29, i64 29, i64 29>
  %out = or <4 x i64> %lhs_div, %rhs_shift
  ret <4 x i64> %out
}

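; i * 640 == (i * 10) << 6 and 6 + 26 == 32, so only the multiply by 10 is kept
; and the remaining shifts fold into rotl(i * 10, 6).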
define <8 x i32> @vroll_extract_mul(<8 x i32> %i) nounwind {
; X86-LABEL: vroll_extract_mul:
; X86:       # %bb.0:
; X86-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %ymm0
; X86-NEXT:    vprold $6, %ymm0, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: vroll_extract_mul:
; X64:       # %bb.0:
; X64-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
; X64-NEXT:    vprold $6, %ymm0, %ymm0
; X64-NEXT:    retq
  %lhs_mul = mul <8 x i32> %i, <i32 640, i32 640, i32 640, i32 640, i32 640, i32 640, i32 640, i32 640>
  %rhs_mul = mul <8 x i32> %i, <i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10>
  %rhs_shift = lshr <8 x i32> %rhs_mul, <i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26, i32 26>
  %out = or <8 x i32> %lhs_mul, %rhs_shift
  ret <8 x i32> %out
}

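; i /u 384 == (i /u 3) >> 7 and 57 + 7 == 64, so only the udiv by 3 is kept
; and the remaining shifts fold into rotl(i /u 3, 57).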
define <2 x i64> @vrolq_extract_udiv(<2 x i64> %i) nounwind {
; X86-LABEL: vrolq_extract_udiv:
; X86:       # %bb.0:
; X86-NEXT:    subl $32, %esp
; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT:    vextractps $1, %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT:    vmovss %xmm0, (%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $3, {{[0-9]+}}(%esp)
; X86-NEXT:    calll __udivdi3
; X86-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT:    vextractps $3, %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT:    vextractps $2, %xmm0, (%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $3, {{[0-9]+}}(%esp)
; X86-NEXT:    vmovd %eax, %xmm0
; X86-NEXT:    vpinsrd $1, %edx, %xmm0, %xmm0
; X86-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT:    calll __udivdi3
; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT:    vpinsrd $2, %eax, %xmm0, %xmm0
; X86-NEXT:    vpinsrd $3, %edx, %xmm0, %xmm0
; X86-NEXT:    vprolq $57, %xmm0, %xmm0
; X86-NEXT:    addl $32, %esp
; X86-NEXT:    retl
;
; X64-LABEL: vrolq_extract_udiv:
; X64:       # %bb.0:
; X64-NEXT:    vpextrq $1, %xmm0, %rax
; X64-NEXT:    movabsq $-6148914691236517205, %rcx # imm = 0xAAAAAAAAAAAAAAAB
; X64-NEXT:    mulq %rcx
; X64-NEXT:    vmovq %rdx, %xmm1
; X64-NEXT:    vmovq %xmm0, %rax
; X64-NEXT:    mulq %rcx
; X64-NEXT:    vmovq %rdx, %xmm0
; X64-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X64-NEXT:    vpsrlq $1, %xmm0, %xmm0
; X64-NEXT:    vprolq $57, %xmm0, %xmm0
; X64-NEXT:    retq
  %lhs_div = udiv <2 x i64> %i, <i64 3, i64 3>
  %rhs_div = udiv <2 x i64> %i, <i64 384, i64 384>
  %lhs_shift = shl <2 x i64> %lhs_div, <i64 57, i64 57>
  %out = or <2 x i64> %lhs_shift, %rhs_div
  ret <2 x i64> %out
}

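; i * 1152 == (i * 9) << 7 and 7 + 25 == 32; the mask on the multiply side does
; not block the transform: the rotate is formed first and the and is applied to
; the rotated value.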
define <4 x i32> @vrolw_extract_mul_with_mask(<4 x i32> %i) nounwind {
; X86-LABEL: vrolw_extract_mul_with_mask:
; X86:       # %bb.0:
; X86-NEXT:    vpslld $3, %xmm0, %xmm1
; X86-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; X86-NEXT:    vprold $7, %xmm0, %xmm0
; X86-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}{1to4}, %xmm0, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: vrolw_extract_mul_with_mask:
; X64:       # %bb.0:
; X64-NEXT:    vpslld $3, %xmm0, %xmm1
; X64-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
; X64-NEXT:    vprold $7, %xmm0, %xmm0
; X64-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
; X64-NEXT:    retq
  %lhs_mul = mul <4 x i32> %i, <i32 1152, i32 1152, i32 1152, i32 1152>
  %rhs_mul = mul <4 x i32> %i, <i32 9, i32 9, i32 9, i32 9>
  %lhs_and = and <4 x i32> %lhs_mul, <i32 160, i32 160, i32 160, i32 160>
  %rhs_shift = lshr <4 x i32> %rhs_mul, <i32 25, i32 25, i32 25, i32 25>
  %out = or <4 x i32> %lhs_and, %rhs_shift
  ret <4 x i32> %out
}

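; The shift amounts line up (6 + 10 == 16), but rotate is not a legal operation
; for v32i16 (AVX512 has vprold/vprolq but no 16-bit rotate), so it is not
; extracted here.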
define <32 x i16> @illegal_no_extract_mul(<32 x i16> %i) nounwind {
; X86-LABEL: illegal_no_extract_mul:
; X86:       # %bb.0:
; X86-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}, %zmm0, %zmm0 # [10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10]
; X86-NEXT:    vpsrlw $10, %zmm0, %zmm1
; X86-NEXT:    vpsllw $6, %zmm0, %zmm0
; X86-NEXT:    vporq %zmm1, %zmm0, %zmm0
; X86-NEXT:    retl
;
; X64-LABEL: illegal_no_extract_mul:
; X64:       # %bb.0:
; X64-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 # [10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10,10]
; X64-NEXT:    vpsrlw $10, %zmm0, %zmm1
; X64-NEXT:    vpsllw $6, %zmm0, %zmm0
; X64-NEXT:    vporq %zmm1, %zmm0, %zmm0
; X64-NEXT:    retq
  %lhs_mul = mul <32 x i16> %i, <i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640, i16 640>
  %rhs_mul = mul <32 x i16> %i, <i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10>
  %rhs_shift = lshr <32 x i16> %rhs_mul, <i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10>
  %out = or <32 x i16> %lhs_mul, %rhs_shift
  ret <32 x i16> %out
}

; Result would undershift
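; ((i << 11) >> 50) | ((i << 11) << 13): 50 + 13 == 63, one bit short of 64.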
define <4 x i64> @no_extract_shl(<4 x i64> %i) nounwind {
; X86-LABEL: no_extract_shl:
; X86:       # %bb.0:
; X86-NEXT:    vpsllq $24, %ymm0, %ymm1
; X86-NEXT:    vpsrlq $39, %ymm0, %ymm0
; X86-NEXT:    vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}{1to4}, %ymm1, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: no_extract_shl:
; X64:       # %bb.0:
; X64-NEXT:    vpsllq $24, %ymm0, %ymm1
; X64-NEXT:    vpsrlq $39, %ymm0, %ymm0
; X64-NEXT:    vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm1, %ymm0
; X64-NEXT:    retq
  %lhs_mul = shl <4 x i64> %i, <i64 11, i64 11, i64 11, i64 11>
  %rhs_mul = shl <4 x i64> %i, <i64 24, i64 24, i64 24, i64 24>
  %lhs_shift = lshr <4 x i64> %lhs_mul, <i64 50, i64 50, i64 50, i64 50>
  %out = or <4 x i64> %lhs_shift, %rhs_mul
  ret <4 x i64> %out
}

; Result would overshift
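; ((i >> 3) << 28) | ((i >> 3) >> 6): 28 + 6 == 34, which exceeds 32.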
define <4 x i32> @no_extract_shrl(<4 x i32> %i) nounwind {
; X86-LABEL: no_extract_shrl:
; X86:       # %bb.0:
; X86-NEXT:    vpsrld $9, %xmm0, %xmm1
; X86-NEXT:    vpslld $25, %xmm0, %xmm0
; X86-NEXT:    vpternlogd $236, {{\.?LCPI[0-9]+_[0-9]+}}{1to4}, %xmm1, %xmm0
; X86-NEXT:    retl
;
; X64-LABEL: no_extract_shrl:
; X64:       # %bb.0:
; X64-NEXT:    vpsrld $9, %xmm0, %xmm1
; X64-NEXT:    vpslld $25, %xmm0, %xmm0
; X64-NEXT:    vpternlogd $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0
; X64-NEXT:    retq
  %lhs_div = lshr <4 x i32> %i, <i32 3, i32 3, i32 3, i32 3>
  %rhs_div = lshr <4 x i32> %i, <i32 9, i32 9, i32 9, i32 9>
  %lhs_shift = shl <4 x i32> %lhs_div, <i32 28, i32 28, i32 28, i32 28>
  %out = or <4 x i32> %lhs_shift, %rhs_div
  ret <4 x i32> %out
}

; Can factor 512 from 1536, but result is 3 instead of 9
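; The lshr by 23 would pair with a shl by 9, but i * 1536 == (i * 3) << 9 while
; the shifted operand multiplies by 9, so the two sides do not match.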
define <8 x i32> @no_extract_mul(<8 x i32> %i) nounwind {
; X86-LABEL: no_extract_mul:
; X86:       # %bb.0:
; X86-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %ymm1
; X86-NEXT:    vpslld $3, %ymm0, %ymm2
; X86-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
; X86-NEXT:    vpsrld $23, %ymm0, %ymm0
; X86-NEXT:    vpor %ymm0, %ymm1, %ymm0
; X86-NEXT:    retl
;
; X64-LABEL: no_extract_mul:
; X64:       # %bb.0:
; X64-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm1
; X64-NEXT:    vpslld $3, %ymm0, %ymm2
; X64-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
; X64-NEXT:    vpsrld $23, %ymm0, %ymm0
; X64-NEXT:    vpor %ymm0, %ymm1, %ymm0
; X64-NEXT:    retq
  %lhs_mul = mul <8 x i32> %i, <i32 1536, i32 1536, i32 1536, i32 1536, i32 1536, i32 1536, i32 1536, i32 1536>
  %rhs_mul = mul <8 x i32> %i, <i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9, i32 9>
  %rhs_shift = lshr <8 x i32> %rhs_mul, <i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23>
  %out = or <8 x i32> %lhs_mul, %rhs_shift
  ret <8 x i32> %out
}

; Can't evenly factor 256 from 770
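; The shl by 56 would pair with a lshr by 8 of i /u 3, i.e. a udiv by 768, but
; the other operand divides by 770.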
define <2 x i64> @no_extract_udiv(<2 x i64> %i) nounwind {
; X86-LABEL: no_extract_udiv:
; X86:       # %bb.0:
; X86-NEXT:    subl $48, %esp
; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT:    vextractps $1, %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT:    vmovss %xmm0, (%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $3, {{[0-9]+}}(%esp)
; X86-NEXT:    calll __udivdi3
; X86-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT:    vextractps $3, %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT:    vextractps $2, %xmm0, (%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $3, {{[0-9]+}}(%esp)
; X86-NEXT:    vmovd %eax, %xmm0
; X86-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT:    calll __udivdi3
; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT:    vpinsrd $2, %eax, %xmm0, %xmm0
; X86-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT:    vextractps $1, %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT:    vmovss %xmm0, (%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $770, {{[0-9]+}}(%esp) # imm = 0x302
; X86-NEXT:    calll __udivdi3
; X86-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT:    vextractps $3, %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT:    vextractps $2, %xmm0, (%esp)
; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
; X86-NEXT:    movl $770, {{[0-9]+}}(%esp) # imm = 0x302
; X86-NEXT:    vmovd %eax, %xmm0
; X86-NEXT:    vpinsrd $1, %edx, %xmm0, %xmm0
; X86-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT:    calll __udivdi3
; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT:    vpinsrd $2, %eax, %xmm0, %xmm0
; X86-NEXT:    vpinsrd $3, %edx, %xmm0, %xmm0
; X86-NEXT:    vpsllq $56, {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; X86-NEXT:    vpor %xmm0, %xmm1, %xmm0
; X86-NEXT:    addl $48, %esp
; X86-NEXT:    retl
;
; X64-LABEL: no_extract_udiv:
; X64:       # %bb.0:
; X64-NEXT:    vpextrq $1, %xmm0, %rcx
; X64-NEXT:    movabsq $-6148914691236517205, %rdi # imm = 0xAAAAAAAAAAAAAAAB
; X64-NEXT:    movq %rcx, %rax
; X64-NEXT:    mulq %rdi
; X64-NEXT:    vmovq %rdx, %xmm1
; X64-NEXT:    vmovq %xmm0, %rsi
; X64-NEXT:    movq %rsi, %rax
; X64-NEXT:    mulq %rdi
; X64-NEXT:    vmovq %rdx, %xmm0
; X64-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X64-NEXT:    vpsrlq $1, %xmm0, %xmm0
; X64-NEXT:    movabsq $-6180857105216966645, %rdi # imm = 0xAA392F35DC17F00B
; X64-NEXT:    movq %rcx, %rax
; X64-NEXT:    mulq %rdi
; X64-NEXT:    vmovq %rdx, %xmm1
; X64-NEXT:    movq %rsi, %rax
; X64-NEXT:    mulq %rdi
; X64-NEXT:    vmovq %rdx, %xmm2
; X64-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; X64-NEXT:    vpsrlq $9, %xmm1, %xmm1
; X64-NEXT:    vpsllq $56, %xmm0, %xmm0
; X64-NEXT:    vpor %xmm1, %xmm0, %xmm0
; X64-NEXT:    retq
  %lhs_div = udiv <2 x i64> %i, <i64 3, i64 3>
  %rhs_div = udiv <2 x i64> %i, <i64 770, i64 770>
  %lhs_shift = shl <2 x i64> %lhs_div, <i64 56, i64 56>
  %out = or <2 x i64> %lhs_shift, %rhs_div
  ret <2 x i64> %out
}

; DAGCombiner transforms shl X, 1 into add X, X.
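; The add below stands in for shl %i, 1; together with the lshr by 31
; (1 + 31 == 32) it still matches a rotate left by 1.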
define <4 x i32> @extract_add_1(<4 x i32> %i) nounwind {
; CHECK-LABEL: extract_add_1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vprold $1, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %ii = add <4 x i32> %i, %i
  %rhs = lshr <4 x i32> %i, <i32 31, i32 31, i32 31, i32 31>
  %out = or <4 x i32> %ii, %rhs
  ret <4 x i32> %out
}

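; Same as extract_add_1 but with the operands of the or commuted; the rotate is
; still recognized.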
define <4 x i32> @extract_add_1_comut(<4 x i32> %i) nounwind {
; CHECK-LABEL: extract_add_1_comut:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vprold $1, %xmm0, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %ii = add <4 x i32> %i, %i
  %lhs = lshr <4 x i32> %i, <i32 31, i32 31, i32 31, i32 31>
  %out = or <4 x i32> %lhs, %ii
  ret <4 x i32> %out
}

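; 1 + 27 != 32, so the add/lshr pair is not a rotate and is left as-is.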
define <4 x i32> @no_extract_add_1(<4 x i32> %i) nounwind {
; CHECK-LABEL: no_extract_add_1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpaddd %xmm0, %xmm0, %xmm1
; CHECK-NEXT:    vpsrld $27, %xmm0, %xmm0
; CHECK-NEXT:    vpor %xmm0, %xmm1, %xmm0
; CHECK-NEXT:    ret{{[l|q]}}
  %ii = add <4 x i32> %i, %i
  %rhs = lshr <4 x i32> %i, <i32 27, i32 27, i32 27, i32 27>
  %out = or <4 x i32> %ii, %rhs
  ret <4 x i32> %out
}