xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/powerpc64/mode64/dive_1.asm (revision 122b5006ee1bd67145794b4cde92f4fe4781a5ec)
1dnl  PowerPC-64 mpn_divexact_1 -- mpn by limb exact division.
2
3dnl  Copyright 2006, 2010 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of either:
9dnl
10dnl    * the GNU Lesser General Public License as published by the Free
11dnl      Software Foundation; either version 3 of the License, or (at your
12dnl      option) any later version.
13dnl
14dnl  or
15dnl
16dnl    * the GNU General Public License as published by the Free Software
17dnl      Foundation; either version 2 of the License, or (at your option) any
18dnl      later version.
19dnl
20dnl  or both in parallel, as here.
21dnl
22dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25dnl  for more details.
26dnl
27dnl  You should have received copies of the GNU General Public License and the
28dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29dnl  see https://www.gnu.org/licenses/.
30
31include(`../config.m4')
32
33C			cycles/limb
34C			norm	unorm
35C POWER3/PPC630	       13-19
36C POWER4/PPC970		16
37C POWER5		16	16
38C POWER6		37	46
39C POWER7		12	12
40C POWER8		12	12
41
42C TODO
43C  * Check if n=1 code is really an improvement.  It probably isn't.
44C  * Make more similar to mode1o.asm.
45
46C INPUT PARAMETERS
C Register aliases for the four arguments of mpn_divexact_1:
C   rp  destination pointer; the quotient limbs are stored through it
C   up  source pointer; the dividend limbs are loaded through it
C   n   limb count; decremented once on entry and then moved to CTR
C   d   the single-limb divisor
47define(`rp', `r3')
48define(`up', `r4')
49define(`n',  `r5')
50define(`d',  `r6')
51
52
53ASM_START()
54
C Byte table read by mpn_divexact_1 with lbzx, indexed by bits 1..7 of the
C odd divisor, to obtain the starting value for the inverse iteration.
55EXTERN(binvert_limb_table)
56
C mpn_divexact_1 (rp, up, n, d)
C
C Stores n quotient limbs at rp, computed from the n limbs at up and the
C single limb d.  Apart from the n = 1 case, which uses divdu, each quotient
C limb is obtained by multiplying with the inverse modulo 2^64 of the odd
C part of d, so the result is the true quotient only when d divides the
C operand exactly.  Nothing in this routine checks that.
C NOTE(review): assumes n >= 1 and d != 0 on entry -- confirm against callers.
C
C Register use:
C   r0, r9	scratch
C   r5		n on entry; after mtctr it is reused for the table address
C		and then for the high limb of q*d that is subtracted from
C		the next source limb
C   r7		inverse of the odd divisor modulo 2^64
C   r8		64 - r10 (even d path only)
C   r10		number of trailing zero bits shifted out of d
C   r11		shifted source limb (even d) or quotient limb (odd d)
C   r12		most recently loaded source limb
C   CTR		n - 1, the trip count of both loops
C   CA		borrow chain between limbs, read and written by subfe
C   cr0		from L(2) on: whether d was odd; tested again before L(norm)
57PROLOGUE(mpn_divexact_1,toc)
C n = n - 1, setting cr0 for the branch below.  addic. also writes CA; on the
C odd-d path no later instruction rewrites CA before the first subfe in
C L(loop1), so that subfe starts from the CA produced here (CA = 1 whenever
C the original n was nonzero).
C NOTE(review): this presumes the LEA expansion leaves CA and cr0 untouched.
58	addic.	n, n, -1
C r12 = up[0]
59	ld	r12, 0(up)
60	bne	cr0, L(2)
C n was 1: a single hardware unsigned divide gives the quotient limb.
61	divdu	r0, r12, d
62	std	r0, 0(rp)
63	blr
64L(2):
C r0 = d & 1 and cr0 records it; r10 = 0 is the shift count for odd d.
C cr0 is not written again before the bne ahead of L(norm).
65	rldicl.	r0, d, 0, 63
66	li	r10, 0
67	bne	cr0, L(7)
C d is even: d & -d isolates the lowest set bit, and 63 - cntlzd of that is
C the number of trailing zero bits.  Keep the count in r10 (upper 32 bits
C cleared) and shift d right so that it becomes odd.
68	neg	r0, d
69	and	r0, d, r0
70	cntlzd	r0, r0
71	subfic	r0, r0, 63
72	rldicl	r10, r0, 0, 32
73	srd	d, d, r0
74L(7):
C CTR = n - 1.  n lives in r5, which is free for reuse from here on.
75	mtctr	n
76	LEA(	r5, binvert_limb_table)
C r11 = (d >> 1) & 0x7f, the table index; r0 = table byte at that index,
C presumably the inverse of d modulo 2^8.
77	rldicl	r11, d, 63, 57
78	lbzx	r0, r5, r11
C Three steps of inv = 2*inv - d*inv*inv.  Each step doubles the number of
C correct low bits of the inverse, ending with a full 64-bit inverse in r7.
79	mulld	r9, r0, r0
80	sldi	r0, r0, 1
81	mulld	r9, d, r9
82	subf	r0, r9, r0
83	mulld	r5, r0, r0
84	sldi	r0, r0, 1
85	mulld	r5, d, r5
86	subf	r0, r5, r0
87	mulld	r9, r0, r0
88	sldi	r0, r0, 1
89	mulld	r9, d, r9
90	subf	r7, r9, r0		C r7 = 1/d mod 2^64
91
C cr0 still holds the parity test from L(2): odd d takes the L(norm) path.
92	bne	cr0, L(norm)
C Even d.  r8 = 64 - shift count, r5 = 0 (nothing to subtract from the first
C limb yet), r11 = up[0] >> shift count.
93	subfic	r8, r10, 64		C set carry as side effect
94	li	r5, 0
95	srd	r11, r12, r10
96
97	ALIGN(16)
C Each iteration loads the next source limb, completes the right-shifted
C current limb from its low bits, subtracts the previous high product with
C borrow, and multiplies by the inverse to get one quotient limb.
98L(loop0):
99	ld	r12, 8(up)
C nop has no architectural effect; presumably kept for scheduling.
100	nop
101	addi	up, up, 8
C r11 |= next limb << (64 - shift): the complete shifted source limb
102	sld	r0, r12, r8
103	or	r11, r11, r0
C r9 = r11 - r5 - (1 - CA); CA is updated for the next limb
104	subfe	r9, r5, r11
C start the next shifted limb from the high bits of the limb just loaded
105	srd	r11, r12, r10
C r0 = quotient limb = low 64 bits of inverse * r9
106	mulld	r0, r7, r9
C r5 = high 64 bits of quotient limb * d, subtracted from the next limb
107	mulhdu	r5, r0, d
108	std	r0, 0(rp)
109	addi	rp, rp, 8
110	bdnz	L(loop0)
111
C Last limb: only the shifted remainder in r11 is left, no further load.
112	subfe	r0, r5, r11
113	mulld	r0, r7, r0
114	std	r0, 0(rp)
115	blr
116
117	ALIGN(16)
C Odd d: no shifting is needed.  The first quotient limb is up[0] * inverse;
C r5 receives the high half of quotient limb * d for the next limb.
118L(norm):
119	mulld	r11, r12, r7
120	mulhdu	r5, r11, d
121	std	r11, 0(rp)
122	ALIGN(16)
C Each iteration: r5 = next source limb - previous high product - borrow,
C quotient limb = inverse * r5, stored one limb beyond the current rp.
123L(loop1):
124	ld	r9, 8(up)
125	addi	up, up, 8
C r5 = r9 - r5 - (1 - CA); CA is updated for the next limb
126	subfe	r5, r5, r9
127	mulld	r11, r7, r5
128	mulhdu	r5, r11, d	C result not used in last iteration
129	std	r11, 8(rp)
130	addi	rp, rp, 8
131	bdnz	L(loop1)
132
133	blr
134EPILOGUE()
135ASM_END()
136