xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/arm64/lshiftc.asm (revision 9fb66d812c00ebfb445c0b47dea128f32aa6fe96)
1dnl  ARM64 mpn_lshiftc.
2
3dnl  Copyright 2013, 2014, 2017 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of the GNU Lesser General Public License as published
9dnl  by the Free Software Foundation; either version 3 of the License, or (at
10dnl  your option) any later version.
11
12dnl  The GNU MP Library is distributed in the hope that it will be useful, but
13dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
15dnl  License for more details.
16
17dnl  You should have received a copy of the GNU Lesser General Public License
18dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
19
20include(`../config.m4')
21
22C	     cycles/limb   assumed optimal c/l
23C Cortex-A53	3.5-4.0		 3.25
24C Cortex-A57	 2.0		 2.0
25C X-Gene	 2.67		 2.5
26
27C TODO
C  * The feed-in code uses 1 ldr for odd sizes and 2 ldr for even sizes.  These
C    numbers should be 1 and 0, respectively.  The str in wind-down should also
C    go.
31C  * Using extr and with 63 separate loops we might reach 1.5 c/l on A57.
32C  * A53's speed depends on alignment, tune/speed -w1 gives 3.5, -w0 gives 4.0.
33
C Make blah the m4 comment delimiter, so that # no longer starts an m4
C comment.  The instructions below use # for immediate operands.
changecom(blah)

C Incoming arguments, in argument-register order.
define(`rp_arg',`x0')
define(`up',`x1')
define(`n',`x2')
define(`cnt',`x3')

C Registers chosen by this file: the working result pointer and the
C complementary shift count.
define(`rp',`x16')
define(`tnc',`x8')

C Shift mnemonics.  PSHIFT is the shift applied with cnt, NSHIFT the opposite
C direction applied with tnc.
define(`PSHIFT',`lsl')
define(`NSHIFT',`lsr')

ASM_START()
C mpn_lshiftc -- shift left by cnt bits and complement the result.
C
C Register arguments: x0 = rp, x1 = up, x2 = n, x3 = cnt.
C
C Shifts the n limbs at up left by cnt bits and stores the bitwise complement
C of each result limb at rp.  The bits shifted out of the most significant
C limb are returned in x0, uncomplemented.  Limbs are processed from the most
C significant end downwards, so both pointers are first advanced past the end
C of their operands and all accesses use negative offsets.
C
C Requirements visible in the code: n >= 1, since every feed-in path loads at
C least one limb before looking at the count, and 1 <= cnt <= 63, since the
C complementary count is formed as -cnt and is only correct modulo 64.
C
C Register use:
C   x0       return value; overwrites rp_arg, hence the copy of rp in x16
C   x2       reused once n has been decoded: limb << cnt awaiting the bits
C            of the next lower limb
C   x4-x7    source limbs
C   x10-x13  partial shifts and finished result limbs
C   x17      number of 4-limb loop iterations left, initially n / 4
C
C x17 holds the counter instead of x18: x18 is the platform register in
C AAPCS64 and is reserved on some systems, so portable code must not clobber
C it.  x16 and x17 are call-clobbered scratch registers and this routine
C makes no calls.
PROLOGUE(mpn_lshiftc)
	add	rp, rp_arg, n, lsl #3	C rp = one past the last result limb
	add	up, up, n, lsl #3	C up = one past the last source limb
	sub	tnc, xzr, cnt		C tnc = -cnt, i.e. 64-cnt modulo 64
	lsr	x17, n, #2		C x17 = n / 4
	tbz	n, #0, L(bx0)		C dispatch on the low two bits of n

C n odd: fetch the most significant limb.
L(bx1):	ldr	x4, [up,#-8]
	tbnz	n, #1, L(b11)

C n = 1 mod 4.
L(b01):	NSHIFT	x0, x4, tnc		C return value: bits shifted out
	PSHIFT	x2, x4, cnt
	cbnz	x17, L(gt1)
	mvn	x2, x2			C n = 1: single limb, store and return
	str	x2, [rp,#-8]
	ret
L(gt1):	ldp	x4, x5, [up,#-24]	C next two limbs below the top one
	sub	up, up, #8		C bias up and rp to suit the offsets
	add	rp, rp, #16		C used from lo2 onwards
	b	L(lo2)

C n = 3 mod 4.
L(b11):	NSHIFT	x0, x4, tnc		C return value: bits shifted out
	PSHIFT	x2, x4, cnt
	ldp	x6, x7, [up,#-24]!
	b	L(lo3)			C enters past the counter decrement

C n even: fetch the two most significant limbs.
L(bx0):	ldp	x4, x5, [up,#-16]
	tbz	n, #1, L(b00)

C n = 2 mod 4.
L(b10):	NSHIFT	x0, x5, tnc		C return value: bits shifted out
	PSHIFT	x13, x5, cnt
	NSHIFT	x10, x4, tnc
	PSHIFT	x2, x4, cnt
	cbnz	x17, L(gt2)
	eon	x10, x10, x13		C n = 2: finish both limbs and return
	mvn	x2, x2
	stp	x2, x10, [rp,#-16]
	ret
L(gt2):	ldp	x4, x5, [up,#-32]
	eon	x10, x10, x13
	str	x10, [rp,#-8]		C most significant result limb
	sub	up, up, #16		C bias up and rp to suit the offsets
	add	rp, rp, #8		C used from lo2 onwards
	b	L(lo2)

C n = 0 mod 4.
L(b00):	NSHIFT	x0, x5, tnc		C return value: bits shifted out
	PSHIFT	x13, x5, cnt
	NSHIFT	x10, x4, tnc
	PSHIFT	x2, x4, cnt
	ldp	x6, x7, [up,#-32]!
	eon	x10, x10, x13
	str	x10, [rp,#-8]!		C most significant result limb
	b	L(lo0)

C Main loop, four limbs per iteration.  The two shifted halves of a result
C limb occupy disjoint bit positions, so eon (xor with the complement of the
C second operand) both merges and complements them in one instruction.
	ALIGN(16)
L(top):	ldp	x4, x5, [up,#-16]
	eon	x10, x10, x13
	eon	x11, x12, x2
	stp	x10, x11, [rp,#-16]
	PSHIFT	x2, x6, cnt
L(lo2):	NSHIFT	x10, x4, tnc
	PSHIFT	x13, x5, cnt
	NSHIFT	x12, x5, tnc
	ldp	x6, x7, [up,#-32]!
	eon	x10, x10, x13
	eon	x11, x12, x2
	stp	x10, x11, [rp,#-32]!
	PSHIFT	x2, x4, cnt
L(lo0):	sub	x17, x17, #1
L(lo3):	NSHIFT	x10, x6, tnc
	PSHIFT	x13, x7, cnt
	NSHIFT	x12, x7, tnc
	cbnz	x17, L(top)

C Wind-down: finish the last two combined limbs, then the least significant
C limb, which has no lower neighbour and is just shifted and complemented.
L(end):	eon	x10, x10, x13
	eon	x11, x12, x2
	PSHIFT	x2, x6, cnt
	stp	x10, x11, [rp,#-16]
	mvn	x2, x2
	str	x2, [rp,#-24]
	ret
EPILOGUE()
131