xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/arm64/lshift.asm (revision 9fd8799cb5ceb66c69f2eb1a6d26a1d587ba1f1e)
1dnl  ARM64 mpn_lshift.
2
3dnl  Copyright 2013, 2014, 2017 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of the GNU Lesser General Public License as published
9dnl  by the Free Software Foundation; either version 3 of the License, or (at
10dnl  your option) any later version.
11
12dnl  The GNU MP Library is distributed in the hope that it will be useful, but
13dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
15dnl  License for more details.
16
17dnl  You should have received a copy of the GNU Lesser General Public License
18dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
19
20include(`../config.m4')
21
22C	     cycles/limb   assumed optimal c/l
23C Cortex-A53	3.5-4.0		 3.25
24C Cortex-A57	 2.0		 2.0
25C X-Gene	 2.67		 2.5
26
27C TODO
28C  * The feed-in code used 1 ldr for odd sized and 2 ldr for even sizes.  These
29C    numbers should be 1 and 0, respectively.  The str in wind-down should also
30C    go.
31C  * Using extr and with 63 separate loops we might reach 1.25 c/l on A57.
32C  * A53's speed depends on alignment, tune/speed -w1 gives 3.5, -w0 gives 4.0.
33
C Stop m4 from treating the hash sign as a comment start, so that macros
C written after an immediate operand on the same line (for instance the L()
C label in a tbz) are still expanded.
changecom(blah)

C Incoming argument registers.
define(`rp_arg',`x0')
define(`up',	`x1')
define(`n',	`x2')
define(`cnt',	`x3')

C Registers with a fixed role inside the routine.
C   rp   working destination pointer (x0 is reused for the return value)
C   tnc  negated shift count
define(`rp',	`x16')
define(`tnc',	`x8')

C Shift mnemonics.  PSHIFT is the direction of the overall shift, NSHIFT the
C opposite direction, used to bring in the bits of the neighbouring limb.
define(`PSHIFT', lsl)
define(`NSHIFT', lsr)
47
C mp_limb_t mpn_lshift (mp_ptr rp, mp_srcptr up, mp_size_t n, unsigned int cnt)
C
C Shift {up,n} left by cnt bits, store the n result limbs at rp, and return
C (in x0) the bits shifted out of the most significant limb, that is
C up[n-1] >> (64-cnt).  Per the GMP manual the caller must pass n >= 1 and
C 1 <= cnt <= 63; neither condition is checked here.
C
C Method: rp[i] = (up[i] << cnt) | (up[i-1] >> (64-cnt)), computed from the
C most significant limb downwards.  Both pointers are first moved to one past
C the top limb and all accesses use negative offsets.  The main loop handles
C four limbs per pass; n mod 4 selects one of four feed-in sequences
C (b01, b10, b11, b00) which also produce the return value.
C
C Register use:
C   x0        rp_arg on entry, return value afterwards
C   x2        n on entry; afterwards the PSHIFT part of the lowest limb
C             consumed so far, waiting for the NSHIFT part of the next limb
C   x4-x7     source limbs
C   x10-x13   shifted parts and combined result limbs
C   x17       loop counter, starts at n/4
C
C x17 is used for the counter rather than x18 because AAPCS64 designates x18
C as the platform register, which some platforms reserve.
ASM_START()
PROLOGUE(mpn_lshift)
	add	rp, rp_arg, n, lsl #3	C rp = one past the top result limb
	add	up, up, n, lsl #3	C up = one past the top source limb
	sub	tnc, xzr, cnt		C -cnt; lsr uses it mod 64, i.e. 64-cnt
	lsr	x17, n, #2		C loop counter = n/4
	tbz	n, #0, L(bx0)		C branch if n is even

L(bx1):	ldr	x4, [up,#-8]		C n odd: x4 = top limb
	tbnz	n, #1, L(b11)

L(b01):	NSHIFT	x0, x4, tnc		C n = 1 mod 4: return value
	PSHIFT	x2, x4, cnt		C overwrites n, which is dead by now
	cbnz	x17, L(gt1)
	str	x2, [rp,#-8]		C n = 1: the only result limb
	ret
L(gt1):	ldp	x4, x5, [up,#-24]	C the two limbs below the top one
	sub	up, up, #8		C bias both pointers for the
	add	rp, rp, #16		C pre-indexed accesses at L(lo2)
	b	L(lo2)

L(b11):	NSHIFT	x0, x4, tnc		C n = 3 mod 4: return value
	PSHIFT	x2, x4, cnt
	ldp	x6, x7, [up,#-24]!	C the two limbs below the top one
	b	L(lo3)

L(bx0):	ldp	x4, x5, [up,#-16]	C n even: x5 = top limb, x4 = next
	tbz	n, #1, L(b00)

L(b10):	NSHIFT	x0, x5, tnc		C n = 2 mod 4: return value
	PSHIFT	x13, x5, cnt
	NSHIFT	x10, x4, tnc
	PSHIFT	x2, x4, cnt
	cbnz	x17, L(gt2)
	orr	x10, x10, x13		C n = 2: top result limb
	stp	x2, x10, [rp,#-16]	C store both result limbs
	ret
L(gt2):	ldp	x4, x5, [up,#-32]	C next lower pair of source limbs
	orr	x10, x10, x13
	str	x10, [rp,#-8]		C top result limb
	sub	up, up, #16		C bias both pointers for the
	add	rp, rp, #8		C pre-indexed accesses at L(lo2)
	b	L(lo2)

L(b00):	NSHIFT	x0, x5, tnc		C n = 0 mod 4: return value
	PSHIFT	x13, x5, cnt
	NSHIFT	x10, x4, tnc
	PSHIFT	x2, x4, cnt
	ldp	x6, x7, [up,#-32]!	C next lower pair of source limbs
	orr	x10, x10, x13
	str	x10, [rp,#-8]!		C top result limb
	b	L(lo0)

	ALIGN(16)
L(top):	ldp	x4, x5, [up,#-16]	C load the pair for the second half
	orr	x10, x10, x13		C finish the pair started at L(lo3)
	orr	x11, x12, x2
	stp	x10, x11, [rp,#-16]
	PSHIFT	x2, x6, cnt		C carry down to the next pair
L(lo2):	NSHIFT	x10, x4, tnc
	PSHIFT	x13, x5, cnt
	NSHIFT	x12, x5, tnc
	ldp	x6, x7, [up,#-32]!	C up moves down four limbs per pass
	orr	x10, x10, x13
	orr	x11, x12, x2
	stp	x10, x11, [rp,#-32]!	C rp moves down four limbs per pass
	PSHIFT	x2, x4, cnt		C carry down to the next pair
L(lo0):	sub	x17, x17, #1
L(lo3):	NSHIFT	x10, x6, tnc
	PSHIFT	x13, x7, cnt
	NSHIFT	x12, x7, tnc
	cbnz	x17, L(top)

L(end):	orr	x10, x10, x13		C wind-down: last full pair
	orr	x11, x12, x2
	PSHIFT	x2, x6, cnt		C lowest limb, zeros shifted in
	stp	x10, x11, [rp,#-16]
	str	x2, [rp,#-24]		C lowest result limb
	ret
EPILOGUE()
128