xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/arm/bdiv_q_1.asm (revision 2718af68c3efc72c9769069b5c7f9ed36f6b9def)
1dnl  ARM v4 mpn_bdiv_q_1, mpn_pi1_bdiv_q_1 -- Hensel division by 1-limb divisor.
2
3dnl  Contributed to the GNU project by Torbjörn Granlund.
4
5dnl  Copyright 2012, 2017 Free Software Foundation, Inc.
6
7dnl  This file is part of the GNU MP Library.
8dnl
9dnl  The GNU MP Library is free software; you can redistribute it and/or modify
10dnl  it under the terms of either:
11dnl
12dnl    * the GNU Lesser General Public License as published by the Free
13dnl      Software Foundation; either version 3 of the License, or (at your
14dnl      option) any later version.
15dnl
16dnl  or
17dnl
18dnl    * the GNU General Public License as published by the Free Software
19dnl      Foundation; either version 2 of the License, or (at your option) any
20dnl      later version.
21dnl
22dnl  or both in parallel, as here.
23dnl
24dnl  The GNU MP Library is distributed in the hope that it will be useful, but
25dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
26dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
27dnl  for more details.
28dnl
29dnl  You should have received copies of the GNU General Public License and the
30dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
31dnl  see https://www.gnu.org/licenses/.
32
33include(`../config.m4')
34
35C               cycles/limb
36C               norm   unorm
37C 1176		13	18
38C Cortex-A5	 8	12
39C Cortex-A7	10.5	18
40C Cortex-A8	14	15
41C Cortex-A9	10	12		not measured since latest edits
42C Cortex-A15	 9	 9
43C Cortex-A53	14	20
44
45C Architecture requirements:
46C v5	-
47C v5t	-
48C v5te	-
49C v6	-
50C v6t2	-
51C v7a	-
52
C Symbolic names for the registers and stack words used below.
C
C rp, up, n and d are bound to r0-r3.  di_arg and cnt_arg name two stack
C words that, per the existing remarks, only mpn_pi1_bdiv_q_1 receives.
C NOTE(review): neither di_arg nor cnt_arg is referenced by the code visible
C in this file; mpn_pi1_bdiv_q_1 pushes six registers (24 bytes) and then
C reads the two words directly at sp+24 and sp+28.
C NOTE(review): the decimal number fused onto the start of each line looks
C like a source-browser line number, not program text; left untouched here.
53define(`rp',  `r0')		C destination pointer, written with post-increment
54define(`up',  `r1')		C source pointer, read with post-increment
55define(`n',   `r2')		C remaining limb count, decremented once per limb
56define(`d',   `r3')		C divisor limb
57define(`di_arg',  `sp[0]')		C	just mpn_pi1_bdiv_q_1
58define(`cnt_arg', `sp[4]')		C	just mpn_pi1_bdiv_q_1
59
60define(`cy',  `r7')		C high word of d * previous quotient limb, 0 at start
61define(`cnt', `r6')		C right-shift count applied to the source limbs
62define(`tnc', `r8')		C 32 - cnt, set up only on the L(unorm) path
63
64ASM_START()
C mpn_bdiv_q_1 -- entry point taking only rp, up, n and d (r0-r3).
C
C It computes the two extra values that mpn_pi1_bdiv_q_1 otherwise loads
C from its stack arguments, then joins that function at L(pi1):
C   cnt (r6) = number of trailing zero bits of d; d is shifted right by cnt,
C              so the d used from here on is odd
C   r10      = inverse of the shifted d (see the remark on the last rsb)
C
C r6-r11 are pushed here and popped by the shared tail inside
C mpn_pi1_bdiv_q_1, so the stack depth at L(pi1) matches for both entries.
C r12 is used as scratch and is not saved.
C NOTE(review): d is not checked for zero here; presumably callers guarantee
C d != 0 -- confirm.
C NOTE(review): the decimal number fused onto the start of each line looks
C like a source-browser line number, not program text; left untouched here.
65PROLOGUE(mpn_bdiv_q_1)
66	tst	d, #1			C Z clear when d is odd
67	push	{r6-r11}		C restored at L(edn) / L(edu)
68	mov	cnt, #0			C assume no shift is needed
69	bne	L(inv)			C tests the flags from tst: odd d skips the shifting
70
71C count trailing zeros
C Strip zero bits in steps of 16 and 8, then finish with a byte lookup in
C ctz_tab.  d is shifted right as the zeros are counted.
72	movs	r10, d, lsl #16		C Z set when the low 16 bits of d are zero
73	moveq	d, d, lsr #16		C ... then drop them
74	moveq	cnt, #16		C ... and count them
75	tst	d, #0xff		C Z set when the low byte of d is zero
76	moveq	d, d, lsr #8
77	addeq	cnt, cnt, #8
78	LEA(	r10, ctz_tab)
79	and	r11, d, #0xff		C table index = low byte of d
80	ldrb	r10, [r10, r11]		C r10 = trailing zero count of that byte
81	mov	d, d, lsr r10		C remove the last zero bits
82	add	cnt, cnt, r10		C cnt = total bits shifted out of d
83
84C binvert limb
C Start from a byte fetched from binvert_limb_table (defined outside this
C file) at index (d & 254) >> 1, then apply x = 2*x - d*x*x twice.
C NOTE(review): presumably the table entry is an inverse of d modulo 2^8 and
C each refinement doubles the number of valid low bits (8 -> 16 -> 32);
C confirm against the table definition.
85L(inv):	LEA(	r10, binvert_limb_table)
86	and	r12, d, #254		C keep bits 1..7 of d
87	ldrb	r10, [r10, r12, lsr #1]	C x0 = table[(d & 254) >> 1]
88	mul	r12, r10, r10		C x0^2
89	mul	r12, d, r12		C d * x0^2
90	rsb	r12, r12, r10, lsl #1	C x1 = 2*x0 - d*x0^2
91	mul	r10, r12, r12		C x1^2
92	mul	r10, d, r10		C d * x1^2
93	rsb	r10, r10, r12, lsl #1	C r10 = inverse
94	b	L(pi1)			C continue in mpn_pi1_bdiv_q_1, past its argument loads
95EPILOGUE()
96
C mpn_pi1_bdiv_q_1 -- the same division, with the inverse and the shift
C count supplied by the caller as stack arguments.
C
C Inputs:  rp (r0), up (r1), n (r2), d (r3), plus two stack words which,
C          after the 24-byte push below, sit at sp+24 (loaded into r10, the
C          inverse) and sp+28 (loaded into cnt).
C Effect:  reads n limbs from up and stores n limbs to rp.  Each stored limb
C          is the low word of r10 * (source limb - cy), where cy starts at 0
C          and is afterwards the high word of d * (limb just stored).  The
C          borrow of each subtraction is passed to the next one through the
C          C flag.  When cnt is non-zero the source is first shifted right
C          by cnt bits, pulling bits across limb boundaries.
C
C L(pi1) is also the join point for mpn_bdiv_q_1, which arrives with r6-r11
C already pushed, cnt in r6 and the inverse in r10.
C
C Flag discipline in both loops: the C flag written by sbcs is consumed by
C the sbcs of the next iteration (or by the final sbc), so the instructions
C in between use the non-flag-setting sub / mul / umull forms.
C NOTE(review): tst n, n is relied on to update Z without disturbing C;
C confirm against the ARM architecture manual before touching it.
C NOTE(review): n >= 1 is assumed -- up[0] is loaded before n is examined
C and n == 0 is not handled by the subs / beq pair.
C NOTE(review): presumably d must already be odd on this entry (the
C mpn_bdiv_q_1 path shifts it before arriving here) -- confirm with callers.
C NOTE(review): the decimal number fused onto the start of each line looks
C like a source-browser line number, not program text; left untouched here.
97PROLOGUE(mpn_pi1_bdiv_q_1)
98	push	{r6-r11}		C six registers = 24 bytes
99
100	ldr	cnt, [sp, #28]		C second stack argument: shift count
101	ldr	r10, [sp, #24]		C first stack argument: inverse
102
103L(pi1):	ldr	r11, [up], #4		C up[0]
104	cmp	cnt, #0
105	mov	cy, #0			C nothing carried into the first limb
106	bne	L(unorm)		C cnt != 0: source limbs must be shifted
107
C ---- cnt == 0: source limbs are used as they are -------------------------
108L(norm):
109	subs	n, n, #1		C set carry as side-effect
110	beq	L(edn)			C only one limb: skip the loop
111
112	ALIGN(16)
113L(tpn):	sbcs	cy, r11, cy		C cy = r11 - cy - (1 - C); C updated for next round
114	ldr	r11, [up], #4		C fetch the next source limb
115	sub	n, n, #1		C non-S form: flags untouched
116	mul	r9, r10, cy		C r9 = quotient limb, low word of r10 * cy
117	tst	n, n			C Z = (n == 0) for the bne below
118	umull	r12, cy, d, r9		C cy = high word of d * r9; low word in r12 unused
119	str	r9, [rp], #4		C store quotient limb, advance rp
120	bne	L(tpn)
121
C Last limb: same subtract and multiply, no further carry-out is needed.
122L(edn):	sbc	cy, r11, cy
123	mul	r9, r10, cy
124	str	r9, [rp]		C final store, rp not advanced
125	pop	{r6-r11}
126	return	r14			C return is a macro defined outside this file
127
C ---- cnt != 0: shift the source right by cnt bits while dividing ----------
128L(unorm):
129	rsb	tnc, cnt, #32		C tnc = 32 - cnt
130	mov	r11, r11, lsr cnt	C low part of the first shifted limb
131	subs	n, n, #1		C set carry as side-effect
132	beq	L(edu)			C only one limb: skip the loop
133
134	ALIGN(16)
135L(tpu):	ldr	r12, [up], #4		C fetch the next source limb
136	orr	r9, r11, r12, lsl tnc	C shifted limb = (previous >> cnt) | (next << tnc)
137	mov	r11, r12, lsr cnt	C keep the leftover high bits for the next round
138	sbcs	cy, r9, cy		C critical path ->cy->cy->
139	sub	n, n, #1		C non-S form: flags untouched
140	mul	r9, r10, cy		C critical path ->cy->r9->
141	tst	n, n			C Z = (n == 0) for the bne below
142	umull	r12, cy, d, r9		C critical path ->r9->cy->
143	str	r9, [rp], #4		C store quotient limb, advance rp
144	bne	L(tpu)
145
C Last limb: only the bits left in r11 remain, zero-filled at the top by lsr.
146L(edu):	sbc	cy, r11, cy
147	mul	r9, r10, cy
148	str	r9, [rp]		C final store, rp not advanced
149	pop	{r6-r11}
150	return	r14			C return is a macro defined outside this file
151EPILOGUE()
152
152	RODATA
C ctz_tab -- 256 one-byte entries, 32 per line.  For i = 1..255, entry i is
C the number of trailing zero bits of i; entry 0 holds 8.
C mpn_bdiv_q_1 indexes it with the low byte of d (d & 0xff) to finish its
C trailing-zero count.
153ctz_tab:
154	.byte	8,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0
155	.byte	5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0
156	.byte	6,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0
157	.byte	5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0
158	.byte	7,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0
159	.byte	5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0
160	.byte	6,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0
161	.byte	5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0
163