xref: /llvm-project/llvm/test/CodeGen/ARM/funnel-shift.ll (revision e0ed0333f0fed2e73f805afd58b61176a87aa3ad)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=arm-eabi -mattr=+v6t2 | FileCheck %s --check-prefixes=CHECK,SCALAR
3; RUN: llc < %s -mtriple=arm-eabi -mattr=+v6t2 -mattr=+neon | FileCheck %s --check-prefixes=CHECK,NEON
4
5declare i8 @llvm.fshl.i8(i8, i8, i8)
6declare i16 @llvm.fshl.i16(i16, i16, i16)
7declare i32 @llvm.fshl.i32(i32, i32, i32)
8declare i64 @llvm.fshl.i64(i64, i64, i64)
9declare <4 x i32> @llvm.fshl.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
10
11declare i8 @llvm.fshr.i8(i8, i8, i8)
12declare i16 @llvm.fshr.i16(i16, i16, i16)
13declare i32 @llvm.fshr.i32(i32, i32, i32)
14declare i64 @llvm.fshr.i64(i64, i64, i64)
15declare <4 x i32> @llvm.fshr.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
16
17; General case - all operands can be variables.
18
; Variable-amount fshl on i16: expected lowering packs the halves with pkhbt,
; masks the amount to 4 bits, shifts left, and extracts the high half.
define i16 @fshl_i16(i16 %x, i16 %y, i16 %z) {
; CHECK-LABEL: fshl_i16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    pkhbt r0, r1, r0, lsl #16
; CHECK-NEXT:    and r1, r2, #15
; CHECK-NEXT:    lsl r0, r0, r1
; CHECK-NEXT:    lsr r0, r0, #16
; CHECK-NEXT:    bx lr
  %f = call i16 @llvm.fshl.i16(i16 %x, i16 %y, i16 %z)
  ret i16 %f
}
30
; Variable-amount fshl on i32: expected lowering uses a masked lsl of %x
; or'd with %y pre-shifted right by 1 then shifted by the inverted amount,
; avoiding any branch on a zero shift amount.
define i32 @fshl_i32(i32 %x, i32 %y, i32 %z) {
; CHECK-LABEL: fshl_i32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    mov r3, #31
; CHECK-NEXT:    lsr r1, r1, #1
; CHECK-NEXT:    bic r3, r3, r2
; CHECK-NEXT:    and r2, r2, #31
; CHECK-NEXT:    lsl r0, r0, r2
; CHECK-NEXT:    orr r0, r0, r1, lsr r3
; CHECK-NEXT:    bx lr
  %f = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 %z)
  ret i32 %f
}
44
45; Verify that weird types are minimally supported.
46declare i37 @llvm.fshl.i37(i37, i37, i37)
; Non-power-of-2 width: the shift amount must be reduced modulo 37, which the
; ARM backend does via a __aeabi_uldivmod libcall before the funnel shift.
; SCALAR and NEON differ only in minor scheduling/register choices.
define i37 @fshl_i37(i37 %x, i37 %y, i37 %z) {
; SCALAR-LABEL: fshl_i37:
; SCALAR:       @ %bb.0:
; SCALAR-NEXT:    .save {r4, r5, r6, r7, r8, lr}
; SCALAR-NEXT:    push {r4, r5, r6, r7, r8, lr}
; SCALAR-NEXT:    mov r8, r0
; SCALAR-NEXT:    ldr r0, [sp, #28]
; SCALAR-NEXT:    mov r4, r1
; SCALAR-NEXT:    mov r5, r3
; SCALAR-NEXT:    and r1, r0, #31
; SCALAR-NEXT:    ldr r0, [sp, #24]
; SCALAR-NEXT:    mov r6, r2
; SCALAR-NEXT:    mov r2, #37
; SCALAR-NEXT:    mov r3, #0
; SCALAR-NEXT:    bl __aeabi_uldivmod
; SCALAR-NEXT:    lsl r0, r5, #27
; SCALAR-NEXT:    tst r2, #32
; SCALAR-NEXT:    orr r0, r0, r6, lsr #5
; SCALAR-NEXT:    mov r1, r8
; SCALAR-NEXT:    and r3, r2, #31
; SCALAR-NEXT:    mov r7, #31
; SCALAR-NEXT:    movne r1, r0
; SCALAR-NEXT:    lslne r0, r6, #27
; SCALAR-NEXT:    bic r2, r7, r2
; SCALAR-NEXT:    lsl r5, r1, r3
; SCALAR-NEXT:    lsr r0, r0, #1
; SCALAR-NEXT:    movne r4, r8
; SCALAR-NEXT:    lsr r1, r1, #1
; SCALAR-NEXT:    lsl r3, r4, r3
; SCALAR-NEXT:    orr r0, r5, r0, lsr r2
; SCALAR-NEXT:    orr r1, r3, r1, lsr r2
; SCALAR-NEXT:    pop {r4, r5, r6, r7, r8, pc}
;
; NEON-LABEL: fshl_i37:
; NEON:       @ %bb.0:
; NEON-NEXT:    .save {r4, r5, r6, r7, r8, lr}
; NEON-NEXT:    push {r4, r5, r6, r7, r8, lr}
; NEON-NEXT:    mov r4, r1
; NEON-NEXT:    ldr r1, [sp, #28]
; NEON-NEXT:    mov r8, r0
; NEON-NEXT:    ldr r0, [sp, #24]
; NEON-NEXT:    and r1, r1, #31
; NEON-NEXT:    mov r5, r3
; NEON-NEXT:    mov r6, r2
; NEON-NEXT:    mov r2, #37
; NEON-NEXT:    mov r3, #0
; NEON-NEXT:    bl __aeabi_uldivmod
; NEON-NEXT:    lsl r0, r5, #27
; NEON-NEXT:    tst r2, #32
; NEON-NEXT:    orr r0, r0, r6, lsr #5
; NEON-NEXT:    mov r1, r8
; NEON-NEXT:    and r3, r2, #31
; NEON-NEXT:    mov r7, #31
; NEON-NEXT:    movne r1, r0
; NEON-NEXT:    lslne r0, r6, #27
; NEON-NEXT:    bic r2, r7, r2
; NEON-NEXT:    lsl r5, r1, r3
; NEON-NEXT:    lsr r0, r0, #1
; NEON-NEXT:    movne r4, r8
; NEON-NEXT:    lsr r1, r1, #1
; NEON-NEXT:    lsl r3, r4, r3
; NEON-NEXT:    orr r0, r5, r0, lsr r2
; NEON-NEXT:    orr r1, r3, r1, lsr r2
; NEON-NEXT:    pop {r4, r5, r6, r7, r8, pc}
  %f = call i37 @llvm.fshl.i37(i37 %x, i37 %y, i37 %z)
  ret i37 %f
}
114
115; extract(concat(0b1110000, 0b1111111) << 2) = 0b1000011
116
117declare i7 @llvm.fshl.i7(i7, i7, i7)
; All-constant operands: the call must fold to the immediate 67 (0b1000011),
; matching the concat/extract comment above.
define i7 @fshl_i7_const_fold() {
; CHECK-LABEL: fshl_i7_const_fold:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    mov r0, #67
; CHECK-NEXT:    bx lr
  %f = call i7 @llvm.fshl.i7(i7 112, i7 127, i7 2)
  ret i7 %f
}
126
; Overshift constant fold: amount 15 is reduced mod 8 to 7, so 255 << 7 = 128.
define i8 @fshl_i8_const_fold_overshift_1() {
; CHECK-LABEL: fshl_i8_const_fold_overshift_1:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    mov r0, #128
; CHECK-NEXT:    bx lr
  %f = call i8 @llvm.fshl.i8(i8 255, i8 0, i8 15)
  ret i8 %f
}
135
; Overshift constant fold: amount 11 reduces mod 8 to 3; fshl(15, 15, 3) = 120.
define i8 @fshl_i8_const_fold_overshift_2() {
; CHECK-LABEL: fshl_i8_const_fold_overshift_2:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    mov r0, #120
; CHECK-NEXT:    bx lr
  %f = call i8 @llvm.fshl.i8(i8 15, i8 15, i8 11)
  ret i8 %f
}
144
; Overshift constant fold: amount 8 reduces mod 8 to 0, so the result is %x (0).
define i8 @fshl_i8_const_fold_overshift_3() {
; CHECK-LABEL: fshl_i8_const_fold_overshift_3:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    mov r0, #0
; CHECK-NEXT:    bx lr
  %f = call i8 @llvm.fshl.i8(i8 0, i8 225, i8 8)
  ret i8 %f
}
153
154; With constant shift amount, this is 'extr'.
155
; Constant amount: expect a simple two-instruction lsl/orr-with-lsr pair
; (the extract-style lowering), no amount masking needed.
define i32 @fshl_i32_const_shift(i32 %x, i32 %y) {
; CHECK-LABEL: fshl_i32_const_shift:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    lsl r0, r0, #9
; CHECK-NEXT:    orr r0, r0, r1, lsr #23
; CHECK-NEXT:    bx lr
  %f = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 9)
  ret i32 %f
}
165
166; Check modulo math on shift amount.
167
; Amount 41 must be reduced mod 32 to 9 — same code as fshl_i32_const_shift.
define i32 @fshl_i32_const_overshift(i32 %x, i32 %y) {
; CHECK-LABEL: fshl_i32_const_overshift:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    lsl r0, r0, #9
; CHECK-NEXT:    orr r0, r0, r1, lsr #23
; CHECK-NEXT:    bx lr
  %f = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 41)
  ret i32 %f
}
177
178; 64-bit should also work.
179
; i64 with amount 105 (mod 64 = 41): split into two 32-bit extract-style
; shift/orr pairs across the register halves.
define i64 @fshl_i64_const_overshift(i64 %x, i64 %y) {
; CHECK-LABEL: fshl_i64_const_overshift:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    lsl r1, r3, #9
; CHECK-NEXT:    orr r2, r1, r2, lsr #23
; CHECK-NEXT:    lsl r0, r0, #9
; CHECK-NEXT:    orr r1, r0, r3, lsr #23
; CHECK-NEXT:    mov r0, r2
; CHECK-NEXT:    bx lr
  %f = call i64 @llvm.fshl.i64(i64 %x, i64 %y, i64 105)
  ret i64 %f
}
192
193; This should work without any node-specific logic.
194
; Plain constant fold (no overshift): fshl(255, 0, 7) = 128.
define i8 @fshl_i8_const_fold() {
; CHECK-LABEL: fshl_i8_const_fold:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    mov r0, #128
; CHECK-NEXT:    bx lr
  %f = call i8 @llvm.fshl.i8(i8 255, i8 0, i8 7)
  ret i8 %f
}
203
204; Repeat everything for funnel shift right.
205
206; General case - all operands can be variables.
207
; Variable-amount fshr on i16: pack with pkhbt, mask amount to 4 bits, then a
; single right shift — the low half of the packed word is already the result.
define i16 @fshr_i16(i16 %x, i16 %y, i16 %z) {
; CHECK-LABEL: fshr_i16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    pkhbt r0, r1, r0, lsl #16
; CHECK-NEXT:    and r1, r2, #15
; CHECK-NEXT:    lsr r0, r0, r1
; CHECK-NEXT:    bx lr
  %f = call i16 @llvm.fshr.i16(i16 %x, i16 %y, i16 %z)
  ret i16 %f
}
218
; Variable-amount fshr on i32: mirror of fshl_i32 — %x is pre-shifted left by 1
; then shifted by the inverted amount, or'd with a masked lsr of %y.
define i32 @fshr_i32(i32 %x, i32 %y, i32 %z) {
; CHECK-LABEL: fshr_i32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    mov r3, #31
; CHECK-NEXT:    lsl r0, r0, #1
; CHECK-NEXT:    bic r3, r3, r2
; CHECK-NEXT:    and r2, r2, #31
; CHECK-NEXT:    lsl r0, r0, r3
; CHECK-NEXT:    orr r0, r0, r1, lsr r2
; CHECK-NEXT:    bx lr
  %f = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 %z)
  ret i32 %f
}
232
233; Verify that weird types are minimally supported.
234declare i37 @llvm.fshr.i37(i37, i37, i37)
; Non-power-of-2 width, right variant: amount reduced modulo 37 via a
; __aeabi_uldivmod libcall, then adjusted by +27 (64-37) for the 64-bit
; funnel machinery. SCALAR/NEON bodies differ only in register assignment.
define i37 @fshr_i37(i37 %x, i37 %y, i37 %z) {
; SCALAR-LABEL: fshr_i37:
; SCALAR:       @ %bb.0:
; SCALAR-NEXT:    .save {r4, r5, r6, r7, r11, lr}
; SCALAR-NEXT:    push {r4, r5, r6, r7, r11, lr}
; SCALAR-NEXT:    mov r5, r0
; SCALAR-NEXT:    ldr r0, [sp, #28]
; SCALAR-NEXT:    mov r4, r1
; SCALAR-NEXT:    mov r6, r3
; SCALAR-NEXT:    and r1, r0, #31
; SCALAR-NEXT:    ldr r0, [sp, #24]
; SCALAR-NEXT:    mov r7, r2
; SCALAR-NEXT:    mov r2, #37
; SCALAR-NEXT:    mov r3, #0
; SCALAR-NEXT:    bl __aeabi_uldivmod
; SCALAR-NEXT:    add r0, r2, #27
; SCALAR-NEXT:    lsl r2, r6, #27
; SCALAR-NEXT:    orr r2, r2, r7, lsr #5
; SCALAR-NEXT:    mov r1, #31
; SCALAR-NEXT:    tst r0, #32
; SCALAR-NEXT:    mov r3, r5
; SCALAR-NEXT:    moveq r3, r2
; SCALAR-NEXT:    lsleq r2, r7, #27
; SCALAR-NEXT:    bic r1, r1, r0
; SCALAR-NEXT:    and r7, r0, #31
; SCALAR-NEXT:    lsl r6, r3, #1
; SCALAR-NEXT:    moveq r4, r5
; SCALAR-NEXT:    lsl r6, r6, r1
; SCALAR-NEXT:    orr r0, r6, r2, lsr r7
; SCALAR-NEXT:    lsl r2, r4, #1
; SCALAR-NEXT:    lsl r1, r2, r1
; SCALAR-NEXT:    orr r1, r1, r3, lsr r7
; SCALAR-NEXT:    pop {r4, r5, r6, r7, r11, pc}
;
; NEON-LABEL: fshr_i37:
; NEON:       @ %bb.0:
; NEON-NEXT:    .save {r4, r5, r6, r7, r11, lr}
; NEON-NEXT:    push {r4, r5, r6, r7, r11, lr}
; NEON-NEXT:    mov r4, r1
; NEON-NEXT:    ldr r1, [sp, #28]
; NEON-NEXT:    mov r5, r0
; NEON-NEXT:    ldr r0, [sp, #24]
; NEON-NEXT:    and r1, r1, #31
; NEON-NEXT:    mov r6, r3
; NEON-NEXT:    mov r7, r2
; NEON-NEXT:    mov r2, #37
; NEON-NEXT:    mov r3, #0
; NEON-NEXT:    bl __aeabi_uldivmod
; NEON-NEXT:    add r0, r2, #27
; NEON-NEXT:    lsl r2, r6, #27
; NEON-NEXT:    orr r2, r2, r7, lsr #5
; NEON-NEXT:    mov r1, #31
; NEON-NEXT:    tst r0, #32
; NEON-NEXT:    mov r3, r5
; NEON-NEXT:    moveq r3, r2
; NEON-NEXT:    lsleq r2, r7, #27
; NEON-NEXT:    bic r1, r1, r0
; NEON-NEXT:    and r7, r0, #31
; NEON-NEXT:    lsl r6, r3, #1
; NEON-NEXT:    moveq r4, r5
; NEON-NEXT:    lsl r6, r6, r1
; NEON-NEXT:    orr r0, r6, r2, lsr r7
; NEON-NEXT:    lsl r2, r4, #1
; NEON-NEXT:    lsl r1, r2, r1
; NEON-NEXT:    orr r1, r1, r3, lsr r7
; NEON-NEXT:    pop {r4, r5, r6, r7, r11, pc}
  %f = call i37 @llvm.fshr.i37(i37 %x, i37 %y, i37 %z)
  ret i37 %f
}
304
305; extract(concat(0b1110000, 0b1111111) >> 2) = 0b0011111
306
307declare i7 @llvm.fshr.i7(i7, i7, i7)
; All-constant operands: must fold to 31 (0b0011111), per the comment above.
define i7 @fshr_i7_const_fold() {
; CHECK-LABEL: fshr_i7_const_fold:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    mov r0, #31
; CHECK-NEXT:    bx lr
  %f = call i7 @llvm.fshr.i7(i7 112, i7 127, i7 2)
  ret i7 %f
}
316
; Overshift constant fold: amount 15 reduces mod 8 to 7; fshr(255, 0, 7) = 254.
define i8 @fshr_i8_const_fold_overshift_1() {
; CHECK-LABEL: fshr_i8_const_fold_overshift_1:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    mov r0, #254
; CHECK-NEXT:    bx lr
  %f = call i8 @llvm.fshr.i8(i8 255, i8 0, i8 15)
  ret i8 %f
}
325
; Overshift constant fold: amount 11 reduces mod 8 to 3; fshr(15, 15, 3) = 225.
define i8 @fshr_i8_const_fold_overshift_2() {
; CHECK-LABEL: fshr_i8_const_fold_overshift_2:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    mov r0, #225
; CHECK-NEXT:    bx lr
  %f = call i8 @llvm.fshr.i8(i8 15, i8 15, i8 11)
  ret i8 %f
}
334
; Overshift constant fold: amount 8 reduces mod 8 to 0, so the result is %y (255).
define i8 @fshr_i8_const_fold_overshift_3() {
; CHECK-LABEL: fshr_i8_const_fold_overshift_3:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    mov r0, #255
; CHECK-NEXT:    bx lr
  %f = call i8 @llvm.fshr.i8(i8 0, i8 255, i8 8)
  ret i8 %f
}
343
344; With constant shift amount, this is 'extr'.
345
; Constant amount: two-instruction extract-style lowering (shift by 9 right
; equals shift-left of %x by 23 or'd with %y >> 9).
define i32 @fshr_i32_const_shift(i32 %x, i32 %y) {
; CHECK-LABEL: fshr_i32_const_shift:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    lsl r0, r0, #23
; CHECK-NEXT:    orr r0, r0, r1, lsr #9
; CHECK-NEXT:    bx lr
  %f = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 9)
  ret i32 %f
}
355
356; Check modulo math on shift amount. 41-32=9.
357
; Amount 41 reduces mod 32 to 9 — must emit the same code as fshr_i32_const_shift.
define i32 @fshr_i32_const_overshift(i32 %x, i32 %y) {
; CHECK-LABEL: fshr_i32_const_overshift:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    lsl r0, r0, #23
; CHECK-NEXT:    orr r0, r0, r1, lsr #9
; CHECK-NEXT:    bx lr
  %f = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 41)
  ret i32 %f
}
367
368; 64-bit should also work. 105-64 = 41.
369
; i64 with amount 105 (mod 64 = 41): two 32-bit shift/orr pairs across halves.
define i64 @fshr_i64_const_overshift(i64 %x, i64 %y) {
; CHECK-LABEL: fshr_i64_const_overshift:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    lsl r2, r0, #23
; CHECK-NEXT:    lsl r1, r1, #23
; CHECK-NEXT:    orr r2, r2, r3, lsr #9
; CHECK-NEXT:    orr r1, r1, r0, lsr #9
; CHECK-NEXT:    mov r0, r2
; CHECK-NEXT:    bx lr
  %f = call i64 @llvm.fshr.i64(i64 %x, i64 %y, i64 105)
  ret i64 %f
}
382
383; This should work without any node-specific logic.
384
; Plain constant fold (no overshift): fshr(255, 0, 7) = 254.
define i8 @fshr_i8_const_fold() {
; CHECK-LABEL: fshr_i8_const_fold:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    mov r0, #254
; CHECK-NEXT:    bx lr
  %f = call i8 @llvm.fshr.i8(i8 255, i8 0, i8 7)
  ret i8 %f
}
393
; Shift by exactly the bitwidth: 32 mod 32 = 0, so fshl returns %x unchanged
; (already in r0) — the function must compile to a bare return.
define i32 @fshl_i32_shift_by_bitwidth(i32 %x, i32 %y) {
; CHECK-LABEL: fshl_i32_shift_by_bitwidth:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    bx lr
  %f = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 32)
  ret i32 %f
}
401
; Shift by exactly the bitwidth: 32 mod 32 = 0, so fshr returns %y — just a
; register move from r1 to the return register.
define i32 @fshr_i32_shift_by_bitwidth(i32 %x, i32 %y) {
; CHECK-LABEL: fshr_i32_shift_by_bitwidth:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    mov r0, r1
; CHECK-NEXT:    bx lr
  %f = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 32)
  ret i32 %f
}
410
; Vector splat-bitwidth shift: folds to %x, which already occupies the return
; registers r0-r3 under this ABI, so no code is needed.
define <4 x i32> @fshl_v4i32_shift_by_bitwidth(<4 x i32> %x, <4 x i32> %y) {
; CHECK-LABEL: fshl_v4i32_shift_by_bitwidth:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    bx lr
  %f = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 32, i32 32, i32 32, i32 32>)
  ret <4 x i32> %f
}
418
; Vector splat-bitwidth fshr folds to %y, which was passed on the stack:
; SCALAR reloads it with a single ldm, NEON goes through a q-register load
; and moves the lanes back to r0-r3.
define <4 x i32> @fshr_v4i32_shift_by_bitwidth(<4 x i32> %x, <4 x i32> %y) {
; SCALAR-LABEL: fshr_v4i32_shift_by_bitwidth:
; SCALAR:       @ %bb.0:
; SCALAR-NEXT:    ldm sp, {r0, r1, r2, r3}
; SCALAR-NEXT:    bx lr
;
; NEON-LABEL: fshr_v4i32_shift_by_bitwidth:
; NEON:       @ %bb.0:
; NEON-NEXT:    mov r0, sp
; NEON-NEXT:    vld1.64 {d16, d17}, [r0]
; NEON-NEXT:    vmov r0, r1, d16
; NEON-NEXT:    vmov r2, r3, d17
; NEON-NEXT:    bx lr
  %f = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 32, i32 32, i32 32, i32 32>)
  ret <4 x i32> %f
}
435
436