dnl  ARM64 mpn_rsh1add_n and mpn_rsh1sub_n.

dnl  Contributed to the GNU project by Torbjörn Granlund.

dnl  Copyright 2017 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb   assumed optimal c/l
C Cortex-A53	3.25-3.75	 3.0 steady
C Cortex-A57	 2.15		 1.75
C X-Gene	 2.75		 2.5

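C Both operations compute, per the documented GMP mpn interface,
C
C   {rp,n} = ({up,n} + {vp,n}) >> 1	for mpn_rsh1add_n
C   {rp,n} = ({up,n} - {vp,n}) >> 1	for mpn_rsh1sub_n
C
C and return the bit shifted out, i.e. the low bit of the full n-limb
C sum or difference.  A reference model for the add variant, as a sketch
C built from documented mpn primitives rather than how this file works:
C
C	cy = mpn_add_n (rp, up, vp, n);
C	out = mpn_rshift (rp, rp, n, 1);
C	rp[n-1] |= cy << (GMP_NUMB_BITS - 1);
C	return out >> (GMP_NUMB_BITS - 1);
C
C mpn_rshift returns the shifted-out bits in the most significant bits
C of its return value, hence the final right shift.
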
changecom(blah)
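dnl  The changecom above retires # as the m4 comment delimiter, since ARM64
dnl  immediate operands are written with a leading #.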

define(`rp', `x0')
define(`up', `x1')
define(`vp', `x2')
define(`n',  `x3')
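C Argument registers per the standard AAPCS64 calling convention.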

ifdef(`OPERATION_rsh1add_n', `
  define(`ADDSUB',	adds)
  define(`ADDSUBC',	adcs)
  define(`COND',	`cs')
  define(`func_n',	mpn_rsh1add_n)')
ifdef(`OPERATION_rsh1sub_n', `
  define(`ADDSUB',	subs)
  define(`ADDSUBC',	sbcs)
  define(`COND',	`cc')
  define(`func_n',	mpn_rsh1sub_n)')
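C With the add variant the carry flag holds a true carry and cs captures
C the final carry out; with the sub variant the flag is the inverted
C borrow, so cc captures the final borrow.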

MULFUNC_PROLOGUE(mpn_rsh1add_n mpn_rsh1sub_n)
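C MULFUNC_PROLOGUE names the entry points this multi-function file can
C provide; the build selects one by defining OPERATION_rsh1add_n or
C OPERATION_rsh1sub_n.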

ASM_START()
PROLOGUE(func_n)
	lsr	x18, n, #2

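C x18 counts 4-limb loop iterations; the two low bits of n then select
C one of four feed-in paths which align the operands to the unrolled
C loop.  Note that x18 is the AAPCS64 platform register and is reserved
C on some systems.
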
	tbz	n, #0, L(bx0)

L(bx1):	ldr	x5, [up],#8
	ldr	x9, [vp],#8
	tbnz	n, #1, L(b11)

L(b01):	ADDSUB	x13, x5, x9
	and	x10, x13, #1
	cbz	x18, L(1)
	ldp	x4, x5, [up],#48
	ldp	x8, x9, [vp],#48
	ADDSUBC	x14, x4, x8
	ADDSUBC	x15, x5, x9
	ldp	x4, x5, [up,#-32]
	ldp	x8, x9, [vp,#-32]
	extr	x17, x14, x13, #1
	ADDSUBC	x12, x4, x8
	ADDSUBC	x13, x5, x9
	str	x17, [rp], #24
	sub	x18, x18, #1
	cbz	x18, L(end)
	b	L(top)

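C n = 1: the final carry or borrow becomes the top bit of the single
C result limb.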
L(1):	cset	x14, COND
	extr	x17, x14, x13, #1
	str	x17, [rp]
	mov	x0, x10
	ret

L(b11):	ADDSUB	x15, x5, x9
	and	x10, x15, #1

	ldp	x4, x5, [up],#32
	ldp	x8, x9, [vp],#32
	ADDSUBC	x12, x4, x8
	ADDSUBC	x13, x5, x9
	cbz	x18, L(3)
	ldp	x4, x5, [up,#-16]
	ldp	x8, x9, [vp,#-16]
	extr	x17, x12, x15, #1
	ADDSUBC	x14, x4, x8
	ADDSUBC	x15, x5, x9
	str	x17, [rp], #8
	b	L(mid)

L(3):	extr	x17, x12, x15, #1
	str	x17, [rp], #8
	b	L(2)

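C Even n: bit 1 of n picks a 2-limb (b10) or 4-limb (b00) feed-in.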
L(bx0):	tbz	n, #1, L(b00)

L(b10):	ldp	x4, x5, [up],#32
	ldp	x8, x9, [vp],#32
	ADDSUB	x12, x4, x8
	ADDSUBC	x13, x5, x9
	and	x10, x12, #1
	cbz	x18, L(2)
	ldp	x4, x5, [up,#-16]
	ldp	x8, x9, [vp,#-16]
	ADDSUBC	x14, x4, x8
	ADDSUBC	x15, x5, x9
	b	L(mid)

L(b00):	ldp	x4, x5, [up],#48
	ldp	x8, x9, [vp],#48
	ADDSUB	x14, x4, x8
	ADDSUBC	x15, x5, x9
	and	x10, x14, #1
	ldp	x4, x5, [up,#-32]
	ldp	x8, x9, [vp,#-32]
	ADDSUBC	x12, x4, x8
	ADDSUBC	x13, x5, x9
	add	rp, rp, #16
	sub	x18, x18, #1
	cbz	x18, L(end)

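C Main loop: the add/sub results live in x12-x15 and each output limb is
C formed with extr, which shifts a pair of consecutive result limbs
C right by one bit.  Four limbs are produced per iteration.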
	ALIGN(16)
L(top):	ldp	x4, x5, [up,#-16]
	ldp	x8, x9, [vp,#-16]
	extr	x16, x15, x14, #1
	extr	x17, x12, x15, #1
	ADDSUBC	x14, x4, x8
	ADDSUBC	x15, x5, x9
	stp	x16, x17, [rp,#-16]
L(mid):	ldp	x4, x5, [up],#32
	ldp	x8, x9, [vp],#32
	extr	x16, x13, x12, #1
	extr	x17, x14, x13, #1
	ADDSUBC	x12, x4, x8
	ADDSUBC	x13, x5, x9
	stp	x16, x17, [rp],#32
	sub	x18, x18, #1
	cbnz	x18, L(top)

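C Wind-down: cset materializes the final carry/borrow, and the last extr
C shifts it in as the most significant bit of the result.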
L(end):	extr	x16, x15, x14, #1
	extr	x17, x12, x15, #1
	stp	x16, x17, [rp,#-16]
L(2):	cset	x14, COND
	extr	x16, x13, x12, #1
	extr	x17, x14, x13, #1
	stp	x16, x17, [rp]

L(ret):	mov	x0, x10
	ret
EPILOGUE()