; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=bdver2 | FileCheck %s --check-prefixes=CHECK,XOP,XOPAVX1
; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=bdver4 | FileCheck %s --check-prefixes=CHECK,XOP,XOPAVX2
; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=skylake-avx512 | FileCheck %s --check-prefixes=CHECK,AVX512

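; These tests check that rotate patterns built from shl/lshr/or pairs and
; llvm.fshl are matched to XOP vprot* / AVX512 vprol* instructions.

; Uniform rotate: (x >> 1) | (x << 31) is a rotate-left by 31 and lowers to a
; single vprotd (XOP) / vprold (AVX512).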
define <4 x i32> @rot_v4i32_splat(<4 x i32> %x) {
; XOP-LABEL: rot_v4i32_splat:
; XOP:       # %bb.0:
; XOP-NEXT:    vprotd $31, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; AVX512-LABEL: rot_v4i32_splat:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vprold $31, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = lshr <4 x i32> %x, <i32 1, i32 1, i32 1, i32 1>
  %2 = shl <4 x i32> %x, <i32 31, i32 31, i32 31, i32 31>
  %3 = or <4 x i32> %1, %2
  ret <4 x i32> %3
}

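; Per-element rotate: the lshr and shl amounts sum to 32 in every lane, so this
; is a variable rotate and lowers to vprotd with a constant-pool vector (XOP) /
; vprolvd (AVX512).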
define <4 x i32> @rot_v4i32_non_splat(<4 x i32> %x) {
; XOP-LABEL: rot_v4i32_non_splat:
; XOP:       # %bb.0:
; XOP-NEXT:    vprotd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOP-NEXT:    retq
;
; AVX512-LABEL: rot_v4i32_non_splat:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vprolvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = lshr <4 x i32> %x, <i32 1, i32 2, i32 3, i32 4>
  %2 = shl <4 x i32> %x, <i32 31, i32 30, i32 29, i32 28>
  %3 = or <4 x i32> %1, %2
  ret <4 x i32> %3
}

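; Rotate with both shifted halves masked before the or: the rotate is still
; recognized, leaving a single vprotd/vprold followed by one vector AND.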
define <4 x i32> @rot_v4i32_splat_2masks(<4 x i32> %x) {
; XOP-LABEL: rot_v4i32_splat_2masks:
; XOP:       # %bb.0:
; XOP-NEXT:    vprotd $31, %xmm0, %xmm0
; XOP-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOP-NEXT:    retq
;
; AVX512-LABEL: rot_v4i32_splat_2masks:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vprold $31, %xmm0, %xmm0
; AVX512-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = lshr <4 x i32> %x, <i32 1, i32 1, i32 1, i32 1>
  %2 = and <4 x i32> %1, <i32 4294901760, i32 4294901760, i32 4294901760, i32 4294901760>

  %3 = shl <4 x i32> %x, <i32 31, i32 31, i32 31, i32 31>
  %4 = and <4 x i32> %3, <i32 0, i32 4294901760, i32 0, i32 4294901760>
  %5 = or <4 x i32> %2, %4
  ret <4 x i32> %5
}

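; Same as above with non-uniform rotate amounts: a variable rotate
; (vprotd with a constant vector / vprolvd) followed by one vector AND.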
define <4 x i32> @rot_v4i32_non_splat_2masks(<4 x i32> %x) {
; XOP-LABEL: rot_v4i32_non_splat_2masks:
; XOP:       # %bb.0:
; XOP-NEXT:    vprotd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOP-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOP-NEXT:    retq
;
; AVX512-LABEL: rot_v4i32_non_splat_2masks:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vprolvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = lshr <4 x i32> %x, <i32 1, i32 2, i32 3, i32 4>
  %2 = and <4 x i32> %1, <i32 4294901760, i32 4294901760, i32 4294901760, i32 4294901760>

  %3 = shl <4 x i32> %x, <i32 31, i32 30, i32 29, i32 28>
  %4 = and <4 x i32> %3, <i32 0, i32 4294901760, i32 0, i32 4294901760>
  %5 = or <4 x i32> %2, %4
  ret <4 x i32> %5
}

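; The splat shuffle only keeps lane 0, which is rotated by 0, so the rotate
; disappears and only the broadcast of lane 0 remains.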
define <4 x i32> @rot_v4i32_zero_non_splat(<4 x i32> %x) {
; XOPAVX1-LABEL: rot_v4i32_zero_non_splat:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: rot_v4i32_zero_non_splat:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vbroadcastss %xmm0, %xmm0
; XOPAVX2-NEXT:    retq
;
; AVX512-LABEL: rot_v4i32_zero_non_splat:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vbroadcastss %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %x, <4 x i32> %x, <4 x i32> <i32 0, i32 1, i32 2, i32 3>)
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> zeroinitializer
  ret <4 x i32> %2
}

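; Rotating a value that is all sign bits is a no-op, so only the vpsrad $31
; that produces the sign mask remains.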
define <4 x i32> @rot_v4i32_allsignbits(<4 x i32> %x, <4 x i32> %y) {
; CHECK-LABEL: rot_v4i32_allsignbits:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpsrad $31, %xmm0, %xmm0
; CHECK-NEXT:    retq
  %1 = ashr <4 x i32> %x, <i32 31, i32 31, i32 31, i32 31>
  %2 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %1, <4 x i32> %1, <4 x i32> %y)
  ret <4 x i32> %2
}

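; The rotate by 1 combines with the surrounding arithmetic shifts and mask,
; leaving a single variable arithmetic shift plus one AND.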
define <4 x i32> @rot_v4i32_mask_ashr0(<4 x i32> %a0) {
; XOPAVX1-LABEL: rot_v4i32_mask_ashr0:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vpshad {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOPAVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: rot_v4i32_mask_ashr0:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOPAVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOPAVX2-NEXT:    retq
;
; AVX512-LABEL: rot_v4i32_mask_ashr0:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = ashr <4 x i32> %a0, <i32 25, i32 26, i32 27, i32 28>
  %2 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %1, <4 x i32> %1, <4 x i32> <i32 1, i32 1, i32 1, i32 1>)
  %3 = ashr <4 x i32> %2, <i32 1, i32 2, i32 3, i32 4>
  %4 = and <4 x i32> %3, <i32 -32768, i32 -65536, i32 -32768, i32 -65536>
  ret <4 x i32> %4
}

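; As above, but the splat shuffle keeps only lane 0, so everything reduces to
; a single vpsrad $25, a broadcast of lane 0, and one AND.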
define <4 x i32> @rot_v4i32_mask_ashr1(<4 x i32> %a0) {
; XOPAVX1-LABEL: rot_v4i32_mask_ashr1:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; XOPAVX1-NEXT:    vpsrad $25, %xmm0, %xmm0
; XOPAVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: rot_v4i32_mask_ashr1:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vpsrad $25, %xmm0, %xmm0
; XOPAVX2-NEXT:    vpbroadcastd %xmm0, %xmm0
; XOPAVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOPAVX2-NEXT:    retq
;
; AVX512-LABEL: rot_v4i32_mask_ashr1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsrad $25, %xmm0, %xmm0
; AVX512-NEXT:    vpbroadcastd %xmm0, %xmm0
; AVX512-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %1 = ashr <4 x i32> %a0, <i32 25, i32 26, i32 27, i32 28>
  %2 = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %1, <4 x i32> %1, <4 x i32> <i32 1, i32 2, i32 3, i32 4>)
  %3 = shufflevector <4 x i32> %2, <4 x i32> undef, <4 x i32> zeroinitializer
  %4 = ashr <4 x i32> %3, <i32 1, i32 2, i32 3, i32 4>
  %5 = and <4 x i32> %4, <i32 -4096, i32 -8192, i32 -4096, i32 -8192>
  ret <4 x i32> %5
}

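; Funnel-shift-shaped pattern where the value shifted left is or(y, x) rather
; than x itself: the shift amounts sum to 16, but the two shifted values
; differ, so this cannot become a plain rotate and stays as shifts and ors.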
define <8 x i16> @or_fshl_v8i16(<8 x i16> %x, <8 x i16> %y) {
; XOP-LABEL: or_fshl_v8i16:
; XOP:       # %bb.0:
; XOP-NEXT:    vpor %xmm0, %xmm1, %xmm1
; XOP-NEXT:    vpsrlw $11, %xmm0, %xmm0
; XOP-NEXT:    vpsllw $5, %xmm1, %xmm1
; XOP-NEXT:    vpor %xmm0, %xmm1, %xmm0
; XOP-NEXT:    retq
;
; AVX512-LABEL: or_fshl_v8i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpor %xmm0, %xmm1, %xmm1
; AVX512-NEXT:    vpsllw $5, %xmm1, %xmm1
; AVX512-NEXT:    vpsrlw $11, %xmm0, %xmm0
; AVX512-NEXT:    vpor %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    retq
  %or1 = or <8 x i16> %y, %x
  %sh1 = shl <8 x i16> %or1, <i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5>
  %sh2 = lshr <8 x i16> %x, <i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11>
  %r = or <8 x i16> %sh2, %sh1
  ret <8 x i16> %r
}

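; Same pattern as or_fshl_v8i16 for v4i32 (shift amounts 21 and 11 sum to 32);
; it likewise stays as explicit shifts and ors.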
define <4 x i32> @or_fshl_v4i32(<4 x i32> %x, <4 x i32> %y) {
; XOP-LABEL: or_fshl_v4i32:
; XOP:       # %bb.0:
; XOP-NEXT:    vpor %xmm0, %xmm1, %xmm1
; XOP-NEXT:    vpsrld $11, %xmm0, %xmm0
; XOP-NEXT:    vpslld $21, %xmm1, %xmm1
; XOP-NEXT:    vpor %xmm0, %xmm1, %xmm0
; XOP-NEXT:    retq
;
; AVX512-LABEL: or_fshl_v4i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpor %xmm0, %xmm1, %xmm1
; AVX512-NEXT:    vpslld $21, %xmm1, %xmm1
; AVX512-NEXT:    vpsrld $11, %xmm0, %xmm0
; AVX512-NEXT:    vpor %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    retq
  %or1 = or <4 x i32> %y, %x
  %sh1 = shl <4 x i32> %or1, <i32 21, i32 21, i32 21, i32 21>
  %sh2 = lshr <4 x i32> %x, <i32 11, i32 11, i32 11, i32 11>
  %r = or <4 x i32> %sh2, %sh1
  ret <4 x i32> %r
}

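; Here or(x, y) feeds the right shift: (x << 42) | ((x | y) >> 22) splits so
; x's contribution folds into a rotate by 42 (vprotq / vprolq) that is then
; ORed with y >> 22.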
define <2 x i64> @or_fshr_v2i64(<2 x i64> %x, <2 x i64> %y) {
; XOP-LABEL: or_fshr_v2i64:
; XOP:       # %bb.0:
; XOP-NEXT:    vpsrlq $22, %xmm1, %xmm1
; XOP-NEXT:    vprotq $42, %xmm0, %xmm0
; XOP-NEXT:    vpor %xmm1, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; AVX512-LABEL: or_fshr_v2i64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsrlq $22, %xmm1, %xmm1
; AVX512-NEXT:    vprolq $42, %xmm0, %xmm0
; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %or1 = or <2 x i64> %x, %y
  %sh1 = shl <2 x i64> %x, <i64 42, i64 42>
  %sh2 = lshr <2 x i64> %or1, <i64 22, i64 22>
  %r = or <2 x i64> %sh1, %sh2
  ret <2 x i64> %r
}

declare <4 x i32> @llvm.fshl.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)