; xref: /llvm-project/llvm/test/CodeGen/ARM/funnel-shift.ll (revision 76f90a9d71ee0e6d7ad1f9d67a66d97112328f82)
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=arm-eabi -mattr=+v6t2 | FileCheck %s --check-prefixes=CHECK,SCALAR
; RUN: llc < %s -mtriple=arm-eabi -mattr=+v6t2 -mattr=+neon | FileCheck %s --check-prefixes=CHECK,NEON

declare i8 @llvm.fshl.i8(i8, i8, i8)
declare i16 @llvm.fshl.i16(i16, i16, i16)
declare i32 @llvm.fshl.i32(i32, i32, i32)
declare i64 @llvm.fshl.i64(i64, i64, i64)
declare <4 x i32> @llvm.fshl.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)

declare i8 @llvm.fshr.i8(i8, i8, i8)
declare i16 @llvm.fshr.i16(i16, i16, i16)
declare i32 @llvm.fshr.i32(i32, i32, i32)
declare i64 @llvm.fshr.i64(i64, i64, i64)
declare <4 x i32> @llvm.fshr.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)

; General case - all operands can be variables.

define i16 @fshl_i16(i16 %x, i16 %y, i16 %z) {
; CHECK-LABEL: fshl_i16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    pkhbt r0, r1, r0, lsl #16
; CHECK-NEXT:    and r1, r2, #15
; CHECK-NEXT:    lsl r0, r0, r1
; CHECK-NEXT:    lsr r0, r0, #16
; CHECK-NEXT:    bx lr
  %f = call i16 @llvm.fshl.i16(i16 %x, i16 %y, i16 %z)
  ret i16 %f
}

define i32 @fshl_i32(i32 %x, i32 %y, i32 %z) {
; CHECK-LABEL: fshl_i32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    mov r3, #31
; CHECK-NEXT:    lsr r1, r1, #1
; CHECK-NEXT:    bic r3, r3, r2
; CHECK-NEXT:    and r2, r2, #31
; CHECK-NEXT:    lsl r0, r0, r2
; CHECK-NEXT:    orr r0, r0, r1, lsr r3
; CHECK-NEXT:    bx lr
  %f = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 %z)
  ret i32 %f
}

; Verify that weird types are minimally supported.
declare i37 @llvm.fshl.i37(i37, i37, i37)
define i37 @fshl_i37(i37 %x, i37 %y, i37 %z) {
; SCALAR-LABEL: fshl_i37:
; SCALAR:       @ %bb.0:
; SCALAR-NEXT:    .save {r4, r5, r6, r7, r8, r9, r11, lr}
; SCALAR-NEXT:    push {r4, r5, r6, r7, r8, r9, r11, lr}
; SCALAR-NEXT:    mov r8, r0
; SCALAR-NEXT:    ldr r0, [sp, #36]
; SCALAR-NEXT:    mov r4, r1
; SCALAR-NEXT:    mov r6, r3
; SCALAR-NEXT:    and r1, r0, #31
; SCALAR-NEXT:    ldr r0, [sp, #32]
; SCALAR-NEXT:    mov r9, r2
; SCALAR-NEXT:    mov r2, #37
; SCALAR-NEXT:    mov r3, #0
; SCALAR-NEXT:    bl __aeabi_uldivmod
; SCALAR-NEXT:    lsl r1, r6, #27
; SCALAR-NEXT:    ands r0, r2, #32
; SCALAR-NEXT:    orr r1, r1, r9, lsr #5
; SCALAR-NEXT:    mov r3, r8
; SCALAR-NEXT:    and r6, r2, #31
; SCALAR-NEXT:    mov r7, #31
; SCALAR-NEXT:    movne r3, r1
; SCALAR-NEXT:    cmp r0, #0
; SCALAR-NEXT:    lslne r1, r9, #27
; SCALAR-NEXT:    bic r2, r7, r2
; SCALAR-NEXT:    movne r4, r8
; SCALAR-NEXT:    lsl r5, r3, r6
; SCALAR-NEXT:    lsr r0, r1, #1
; SCALAR-NEXT:    lsl r1, r4, r6
; SCALAR-NEXT:    lsr r3, r3, #1
; SCALAR-NEXT:    orr r0, r5, r0, lsr r2
; SCALAR-NEXT:    orr r1, r1, r3, lsr r2
; SCALAR-NEXT:    pop {r4, r5, r6, r7, r8, r9, r11, pc}
;
; NEON-LABEL: fshl_i37:
; NEON:       @ %bb.0:
; NEON-NEXT:    .save {r4, r5, r6, r7, r11, lr}
; NEON-NEXT:    push {r4, r5, r6, r7, r11, lr}
; NEON-NEXT:    mov r4, r1
; NEON-NEXT:    ldr r1, [sp, #28]
; NEON-NEXT:    mov r6, r0
; NEON-NEXT:    ldr r0, [sp, #24]
; NEON-NEXT:    and r1, r1, #31
; NEON-NEXT:    mov r5, r3
; NEON-NEXT:    mov r7, r2
; NEON-NEXT:    mov r2, #37
; NEON-NEXT:    mov r3, #0
; NEON-NEXT:    bl __aeabi_uldivmod
; NEON-NEXT:    mov r0, #31
; NEON-NEXT:    bic r1, r0, r2
; NEON-NEXT:    lsl r0, r5, #27
; NEON-NEXT:    ands r12, r2, #32
; NEON-NEXT:    orr r0, r0, r7, lsr #5
; NEON-NEXT:    mov r5, r6
; NEON-NEXT:    and r2, r2, #31
; NEON-NEXT:    movne r5, r0
; NEON-NEXT:    lslne r0, r7, #27
; NEON-NEXT:    cmp r12, #0
; NEON-NEXT:    lsl r3, r5, r2
; NEON-NEXT:    lsr r0, r0, #1
; NEON-NEXT:    movne r4, r6
; NEON-NEXT:    orr r0, r3, r0, lsr r1
; NEON-NEXT:    lsr r3, r5, #1
; NEON-NEXT:    lsl r2, r4, r2
; NEON-NEXT:    orr r1, r2, r3, lsr r1
; NEON-NEXT:    pop {r4, r5, r6, r7, r11, pc}
  %f = call i37 @llvm.fshl.i37(i37 %x, i37 %y, i37 %z)
  ret i37 %f
}

; extract(concat(0b1110000, 0b1111111) << 2) = 0b1000011

declare i7 @llvm.fshl.i7(i7, i7, i7)
define i7 @fshl_i7_const_fold() {
; CHECK-LABEL: fshl_i7_const_fold:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    mov r0, #67
; CHECK-NEXT:    bx lr
  %f = call i7 @llvm.fshl.i7(i7 112, i7 127, i7 2)
  ret i7 %f
}

define i8 @fshl_i8_const_fold_overshift_1() {
; CHECK-LABEL: fshl_i8_const_fold_overshift_1:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    mov r0, #128
; CHECK-NEXT:    bx lr
  %f = call i8 @llvm.fshl.i8(i8 255, i8 0, i8 15)
  ret i8 %f
}

define i8 @fshl_i8_const_fold_overshift_2() {
; CHECK-LABEL: fshl_i8_const_fold_overshift_2:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    mov r0, #120
; CHECK-NEXT:    bx lr
  %f = call i8 @llvm.fshl.i8(i8 15, i8 15, i8 11)
  ret i8 %f
}

define i8 @fshl_i8_const_fold_overshift_3() {
; CHECK-LABEL: fshl_i8_const_fold_overshift_3:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    mov r0, #0
; CHECK-NEXT:    bx lr
  %f = call i8 @llvm.fshl.i8(i8 0, i8 225, i8 8)
  ret i8 %f
}

; With constant shift amount, this is 'extr'.

define i32 @fshl_i32_const_shift(i32 %x, i32 %y) {
; CHECK-LABEL: fshl_i32_const_shift:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    lsl r0, r0, #9
; CHECK-NEXT:    orr r0, r0, r1, lsr #23
; CHECK-NEXT:    bx lr
  %f = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 9)
  ret i32 %f
}

; Check modulo math on shift amount.

define i32 @fshl_i32_const_overshift(i32 %x, i32 %y) {
; CHECK-LABEL: fshl_i32_const_overshift:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    lsl r0, r0, #9
; CHECK-NEXT:    orr r0, r0, r1, lsr #23
; CHECK-NEXT:    bx lr
  %f = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 41)
  ret i32 %f
}

; 64-bit should also work.

define i64 @fshl_i64_const_overshift(i64 %x, i64 %y) {
; CHECK-LABEL: fshl_i64_const_overshift:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    lsl r1, r3, #9
; CHECK-NEXT:    orr r2, r1, r2, lsr #23
; CHECK-NEXT:    lsl r0, r0, #9
; CHECK-NEXT:    orr r1, r0, r3, lsr #23
; CHECK-NEXT:    mov r0, r2
; CHECK-NEXT:    bx lr
  %f = call i64 @llvm.fshl.i64(i64 %x, i64 %y, i64 105)
  ret i64 %f
}

; This should work without any node-specific logic.

define i8 @fshl_i8_const_fold() {
; CHECK-LABEL: fshl_i8_const_fold:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    mov r0, #128
; CHECK-NEXT:    bx lr
  %f = call i8 @llvm.fshl.i8(i8 255, i8 0, i8 7)
  ret i8 %f
}

; Repeat everything for funnel shift right.

; General case - all operands can be variables.

define i16 @fshr_i16(i16 %x, i16 %y, i16 %z) {
; CHECK-LABEL: fshr_i16:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    pkhbt r0, r1, r0, lsl #16
; CHECK-NEXT:    and r1, r2, #15
; CHECK-NEXT:    lsr r0, r0, r1
; CHECK-NEXT:    bx lr
  %f = call i16 @llvm.fshr.i16(i16 %x, i16 %y, i16 %z)
  ret i16 %f
}

define i32 @fshr_i32(i32 %x, i32 %y, i32 %z) {
; CHECK-LABEL: fshr_i32:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    mov r3, #31
; CHECK-NEXT:    lsl r0, r0, #1
; CHECK-NEXT:    bic r3, r3, r2
; CHECK-NEXT:    and r2, r2, #31
; CHECK-NEXT:    lsl r0, r0, r3
; CHECK-NEXT:    orr r0, r0, r1, lsr r2
; CHECK-NEXT:    bx lr
  %f = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 %z)
  ret i32 %f
}

; Verify that weird types are minimally supported.
declare i37 @llvm.fshr.i37(i37, i37, i37)
define i37 @fshr_i37(i37 %x, i37 %y, i37 %z) {
; SCALAR-LABEL: fshr_i37:
; SCALAR:       @ %bb.0:
; SCALAR-NEXT:    .save {r4, r5, r6, r7, r8, lr}
; SCALAR-NEXT:    push {r4, r5, r6, r7, r8, lr}
; SCALAR-NEXT:    mov r8, r0
; SCALAR-NEXT:    ldr r0, [sp, #28]
; SCALAR-NEXT:    mov r4, r1
; SCALAR-NEXT:    mov r5, r3
; SCALAR-NEXT:    and r1, r0, #31
; SCALAR-NEXT:    ldr r0, [sp, #24]
; SCALAR-NEXT:    mov r7, r2
; SCALAR-NEXT:    mov r2, #37
; SCALAR-NEXT:    mov r3, #0
; SCALAR-NEXT:    bl __aeabi_uldivmod
; SCALAR-NEXT:    lsl r3, r5, #27
; SCALAR-NEXT:    add r0, r2, #27
; SCALAR-NEXT:    orr r3, r3, r7, lsr #5
; SCALAR-NEXT:    ands r2, r0, #32
; SCALAR-NEXT:    mov r5, r8
; SCALAR-NEXT:    mov r1, #31
; SCALAR-NEXT:    moveq r5, r3
; SCALAR-NEXT:    lsleq r3, r7, #27
; SCALAR-NEXT:    cmp r2, #0
; SCALAR-NEXT:    bic r1, r1, r0
; SCALAR-NEXT:    moveq r4, r8
; SCALAR-NEXT:    lsl r6, r5, #1
; SCALAR-NEXT:    and r7, r0, #31
; SCALAR-NEXT:    lsl r2, r4, #1
; SCALAR-NEXT:    lsl r6, r6, r1
; SCALAR-NEXT:    lsl r1, r2, r1
; SCALAR-NEXT:    orr r0, r6, r3, lsr r7
; SCALAR-NEXT:    orr r1, r1, r5, lsr r7
; SCALAR-NEXT:    pop {r4, r5, r6, r7, r8, pc}
;
; NEON-LABEL: fshr_i37:
; NEON:       @ %bb.0:
; NEON-NEXT:    .save {r4, r5, r6, r7, r8, lr}
; NEON-NEXT:    push {r4, r5, r6, r7, r8, lr}
; NEON-NEXT:    mov r4, r1
; NEON-NEXT:    ldr r1, [sp, #28]
; NEON-NEXT:    mov r8, r0
; NEON-NEXT:    ldr r0, [sp, #24]
; NEON-NEXT:    and r1, r1, #31
; NEON-NEXT:    mov r5, r3
; NEON-NEXT:    mov r7, r2
; NEON-NEXT:    mov r2, #37
; NEON-NEXT:    mov r3, #0
; NEON-NEXT:    bl __aeabi_uldivmod
; NEON-NEXT:    lsl r3, r5, #27
; NEON-NEXT:    add r0, r2, #27
; NEON-NEXT:    orr r3, r3, r7, lsr #5
; NEON-NEXT:    ands r2, r0, #32
; NEON-NEXT:    mov r5, r8
; NEON-NEXT:    mov r1, #31
; NEON-NEXT:    moveq r5, r3
; NEON-NEXT:    lsleq r3, r7, #27
; NEON-NEXT:    cmp r2, #0
; NEON-NEXT:    bic r1, r1, r0
; NEON-NEXT:    moveq r4, r8
; NEON-NEXT:    lsl r6, r5, #1
; NEON-NEXT:    and r7, r0, #31
; NEON-NEXT:    lsl r2, r4, #1
; NEON-NEXT:    lsl r6, r6, r1
; NEON-NEXT:    lsl r1, r2, r1
; NEON-NEXT:    orr r0, r6, r3, lsr r7
; NEON-NEXT:    orr r1, r1, r5, lsr r7
; NEON-NEXT:    pop {r4, r5, r6, r7, r8, pc}
  %f = call i37 @llvm.fshr.i37(i37 %x, i37 %y, i37 %z)
  ret i37 %f
}

; extract(concat(0b1110000, 0b1111111) >> 2) = 0b0011111

declare i7 @llvm.fshr.i7(i7, i7, i7)
define i7 @fshr_i7_const_fold() {
; CHECK-LABEL: fshr_i7_const_fold:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    mov r0, #31
; CHECK-NEXT:    bx lr
  %f = call i7 @llvm.fshr.i7(i7 112, i7 127, i7 2)
  ret i7 %f
}

define i8 @fshr_i8_const_fold_overshift_1() {
; CHECK-LABEL: fshr_i8_const_fold_overshift_1:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    mov r0, #254
; CHECK-NEXT:    bx lr
  %f = call i8 @llvm.fshr.i8(i8 255, i8 0, i8 15)
  ret i8 %f
}

define i8 @fshr_i8_const_fold_overshift_2() {
; CHECK-LABEL: fshr_i8_const_fold_overshift_2:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    mov r0, #225
; CHECK-NEXT:    bx lr
  %f = call i8 @llvm.fshr.i8(i8 15, i8 15, i8 11)
  ret i8 %f
}

define i8 @fshr_i8_const_fold_overshift_3() {
; CHECK-LABEL: fshr_i8_const_fold_overshift_3:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    mov r0, #255
; CHECK-NEXT:    bx lr
  %f = call i8 @llvm.fshr.i8(i8 0, i8 255, i8 8)
  ret i8 %f
}

; With constant shift amount, this is 'extr'.

define i32 @fshr_i32_const_shift(i32 %x, i32 %y) {
; CHECK-LABEL: fshr_i32_const_shift:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    lsl r0, r0, #23
; CHECK-NEXT:    orr r0, r0, r1, lsr #9
; CHECK-NEXT:    bx lr
  %f = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 9)
  ret i32 %f
}

; Check modulo math on shift amount. 41-32=9.

define i32 @fshr_i32_const_overshift(i32 %x, i32 %y) {
; CHECK-LABEL: fshr_i32_const_overshift:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    lsl r0, r0, #23
; CHECK-NEXT:    orr r0, r0, r1, lsr #9
; CHECK-NEXT:    bx lr
  %f = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 41)
  ret i32 %f
}

; 64-bit should also work. 105-64 = 41.

define i64 @fshr_i64_const_overshift(i64 %x, i64 %y) {
; CHECK-LABEL: fshr_i64_const_overshift:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    lsl r2, r0, #23
; CHECK-NEXT:    lsl r1, r1, #23
; CHECK-NEXT:    orr r2, r2, r3, lsr #9
; CHECK-NEXT:    orr r1, r1, r0, lsr #9
; CHECK-NEXT:    mov r0, r2
; CHECK-NEXT:    bx lr
  %f = call i64 @llvm.fshr.i64(i64 %x, i64 %y, i64 105)
  ret i64 %f
}

; This should work without any node-specific logic.

define i8 @fshr_i8_const_fold() {
; CHECK-LABEL: fshr_i8_const_fold:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    mov r0, #254
; CHECK-NEXT:    bx lr
  %f = call i8 @llvm.fshr.i8(i8 255, i8 0, i8 7)
  ret i8 %f
}

define i32 @fshl_i32_shift_by_bitwidth(i32 %x, i32 %y) {
; CHECK-LABEL: fshl_i32_shift_by_bitwidth:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    bx lr
  %f = call i32 @llvm.fshl.i32(i32 %x, i32 %y, i32 32)
  ret i32 %f
}

define i32 @fshr_i32_shift_by_bitwidth(i32 %x, i32 %y) {
; CHECK-LABEL: fshr_i32_shift_by_bitwidth:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    mov r0, r1
; CHECK-NEXT:    bx lr
  %f = call i32 @llvm.fshr.i32(i32 %x, i32 %y, i32 32)
  ret i32 %f
}

define <4 x i32> @fshl_v4i32_shift_by_bitwidth(<4 x i32> %x, <4 x i32> %y) {
; CHECK-LABEL: fshl_v4i32_shift_by_bitwidth:
; CHECK:       @ %bb.0:
; CHECK-NEXT:    bx lr
  %f = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 32, i32 32, i32 32, i32 32>)
  ret <4 x i32> %f
}

define <4 x i32> @fshr_v4i32_shift_by_bitwidth(<4 x i32> %x, <4 x i32> %y) {
; SCALAR-LABEL: fshr_v4i32_shift_by_bitwidth:
; SCALAR:       @ %bb.0:
; SCALAR-NEXT:    ldm sp, {r0, r1, r2, r3}
; SCALAR-NEXT:    bx lr
;
; NEON-LABEL: fshr_v4i32_shift_by_bitwidth:
; NEON:       @ %bb.0:
; NEON-NEXT:    mov r0, sp
; NEON-NEXT:    vld1.64 {d16, d17}, [r0]
; NEON-NEXT:    vmov r0, r1, d16
; NEON-NEXT:    vmov r2, r3, d17
; NEON-NEXT:    bx lr
  %f = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 32, i32 32, i32 32, i32 32>)
  ret <4 x i32> %f
}

