xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/arm64/lshift.asm (revision 1daf83e636cd998f45e5597a8f995a540e2d5b4a)
1dnl  ARM64 mpn_lshift.
2
3dnl  Copyright 2013, 2014, 2017 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of either:
9dnl
10dnl    * the GNU Lesser General Public License as published by the Free
11dnl      Software Foundation; either version 3 of the License, or (at your
12dnl      option) any later version.
13dnl
14dnl  or
15dnl
16dnl    * the GNU General Public License as published by the Free Software
17dnl      Foundation; either version 2 of the License, or (at your option) any
18dnl      later version.
19dnl
20dnl  or both in parallel, as here.
21dnl
22dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25dnl  for more details.
26dnl
27dnl  You should have received copies of the GNU General Public License and the
28dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29dnl  see https://www.gnu.org/licenses/.
30
31include(`../config.m4')
32
33C	     cycles/limb   assumed optimal c/l
34C Cortex-A53	3.5-4.0		 3.25
35C Cortex-A57	 2.0		 2.0
36C X-Gene	 2.67		 2.5
37
38C TODO
39C  * The feed-in code used 1 ldr for odd sized and 2 ldr for even sizes.  These
40C    numbers should be 1 and 0, respectively.  The str in wind-down should also
41C    go.
42C  * Using extr and with 63 separate loops we might reach 1.25 c/l on A57.
43C  * A53's speed depends on alignment, tune/speed -w1 gives 3.5, -w0 gives 4.0.
44
C Move the m4 comment start string away from the hash sign.  Presumably this
C is so that macros written after an immediate operand on the same line (for
C instance a L() label following a bit number) still get expanded.
changecom(blah)

C Shift mnemonics.  The function body applies PSHIFT with cnt and NSHIFT with
C tnc.
define(`PSHIFT', lsl)
define(`NSHIFT', lsr)

C Incoming argument registers, in argument order x0..x3.
define(`rp_arg', `x0')
define(`up',     `x1')
define(`n',      `x2')
define(`cnt',    `x3')

C Scratch registers.  rp is the working destination pointer derived from
C rp_arg (x0 itself is overwritten with the return value); tnc receives the
C negated shift count.
define(`rp',     `x16')
define(`tnc',    `x8')

ASM_START()
C mpn_lshift -- shift an n-limb operand left by cnt bits.
C
C Inputs (register names as defined earlier in this file):
C   rp_arg (x0)  destination limb pointer
C   up     (x1)  source limb pointer
C   n      (x2)  number of 64-bit limbs
C   cnt    (x3)  shift count in bits
C Output:
C   x0           top source limb shifted right by tnc, i.e. the bits pushed
C                out of the most significant limb
C
C Both pointers are first advanced to one past the top limb and the operand
C is then walked from the most significant limb downwards, 4 limbs per loop
C iteration.  The code is entered at one of four points chosen by n mod 4.
C
C Requirements visible from the code:
C   * n >= 1.  n = 0 would take the b00 path, access memory outside the
C     operands and decrement a zero group count.
C   * cnt nonzero modulo 64.  tnc is computed as -cnt and register shifts use
C     the count modulo 64, so cnt = 0 would make NSHIFT by tnc a shift by 0
C     rather than a shift by 64.
C
C Registers written: x0, x1, x2, x4-x8, x10-x13, x16, x17.  The group counter
C is kept in x17 rather than x18, since x18 is the AAPCS64 platform register
C and is reserved on some operating systems.  This is a leaf routine with only
C local branches, so the intra-procedure-call registers x16 and x17 are free
C to use.
PROLOGUE(mpn_lshift)
	add	rp, rp_arg, n, lsl #3	C rp = one past the top destination limb
	add	up, up, n, lsl #3	C up = one past the top source limb
	sub	tnc, xzr, cnt		C tnc = -cnt, acts as 64-cnt in lsr
	lsr	x17, n, #2		C x17 = number of 4-limb groups
	tbz	n, #0, L(bx0)		C bit 0 clear: n is even

C n is odd: fetch the single top limb
L(bx1):	ldr	x4, [up,#-8]
	tbnz	n, #1, L(b11)

C n = 1 mod 4
L(b01):	NSHIFT	x0, x4, tnc		C return value: bits shifted out
	PSHIFT	x2, x4, cnt		C upper part of the top result limb
	cbnz	x17, L(gt1)
	str	x2, [rp,#-8]		C n = 1: the only result limb
	ret
L(gt1):	ldp	x4, x5, [up,#-24]	C the two limbs below the top one
	sub	up, up, #8		C bias up and rp to suit the offsets
	add	rp, rp, #16		C used from lo2 onwards
	b	L(lo2)

C n = 3 mod 4
L(b11):	NSHIFT	x0, x4, tnc		C return value: bits shifted out
	PSHIFT	x2, x4, cnt		C upper part of the top result limb
	ldp	x6, x7, [up,#-24]!	C the two limbs below the top one
	b	L(lo3)			C enters after the counter decrement

C n is even: fetch the two top limbs, x5 being the more significant
L(bx0):	ldp	x4, x5, [up,#-16]
	tbz	n, #1, L(b00)

C n = 2 mod 4
L(b10):	NSHIFT	x0, x5, tnc		C return value: bits shifted out
	PSHIFT	x13, x5, cnt
	NSHIFT	x10, x4, tnc
	PSHIFT	x2, x4, cnt		C also the low result limb when n = 2
	cbnz	x17, L(gt2)
	orr	x10, x10, x13		C n = 2: top result limb
	stp	x2, x10, [rp,#-16]
	ret
L(gt2):	ldp	x4, x5, [up,#-32]	C next two limbs down
	orr	x10, x10, x13
	str	x10, [rp,#-8]		C store the top result limb
	sub	up, up, #16		C bias up and rp to suit the offsets
	add	rp, rp, #8		C used from lo2 onwards
	b	L(lo2)

C n = 0 mod 4
L(b00):	NSHIFT	x0, x5, tnc		C return value: bits shifted out
	PSHIFT	x13, x5, cnt
	NSHIFT	x10, x4, tnc
	PSHIFT	x2, x4, cnt
	ldp	x6, x7, [up,#-32]!	C next two limbs down
	orr	x10, x10, x13
	str	x10, [rp,#-8]!		C store the top result limb
	b	L(lo0)

C Main loop, 4 limbs per iteration.  Source limbs alternate between the pairs
C x4,x5 and x6,x7.  x2 always holds the PSHIFTed lower limb of the previous
C pair, waiting to be combined with the NSHIFTed upper limb of the next pair
C (x12).  x10 and x13 hold the two halves of the result limb formed within a
C pair.
	ALIGN(16)
L(top):	ldp	x4, x5, [up,#-16]
	orr	x10, x10, x13
	orr	x11, x12, x2
	stp	x10, x11, [rp,#-16]
	PSHIFT	x2, x6, cnt
L(lo2):	NSHIFT	x10, x4, tnc
	PSHIFT	x13, x5, cnt
	NSHIFT	x12, x5, tnc
	ldp	x6, x7, [up,#-32]!	C up steps down 4 limbs
	orr	x10, x10, x13
	orr	x11, x12, x2
	stp	x10, x11, [rp,#-32]!	C rp steps down 4 limbs
	PSHIFT	x2, x4, cnt
L(lo0):	sub	x17, x17, #1		C one 4-limb group consumed
L(lo3):	NSHIFT	x10, x6, tnc
	PSHIFT	x13, x7, cnt
	NSHIFT	x12, x7, tnc
	cbnz	x17, L(top)

C Wind-down: x6 is the lowest source limb.  Store the last two combined
C limbs, then the lowest result limb, whose vacated low bits are zero.
L(end):	orr	x10, x10, x13
	orr	x11, x12, x2
	PSHIFT	x2, x6, cnt
	stp	x10, x11, [rp,#-16]
	str	x2, [rp,#-24]
	ret
EPILOGUE()
139