xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/powerpc64/mode64/mod_1_1.asm (revision ce54336801cf28877c3414aa2fcb251dddd543a2)
1dnl  PowerPC-64 mpn_mod_1_1p
2
3dnl  Copyright 2010, 2011 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of either:
9dnl
10dnl    * the GNU Lesser General Public License as published by the Free
11dnl      Software Foundation; either version 3 of the License, or (at your
12dnl      option) any later version.
13dnl
14dnl  or
15dnl
16dnl    * the GNU General Public License as published by the Free Software
17dnl      Foundation; either version 2 of the License, or (at your option) any
18dnl      later version.
19dnl
20dnl  or both in parallel, as here.
21dnl
22dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25dnl  for more details.
26dnl
27dnl  You should have received copies of the GNU General Public License and the
28dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29dnl  see https://www.gnu.org/licenses/.
30
31include(`../config.m4')
32
33C                   cycles/limb
34C POWER3/PPC630          ?
35C POWER4/PPC970         17
36C POWER5                16
37C POWER6                30
38C POWER7                10.2
39
40C TODO
41C  * Optimise, in particular the cps function.  This was compiler-generated and
42C    then hand optimised.
43
44C INPUT PARAMETERS
C  These names document the arguments of mpn_mod_1_1p.  The function bodies
C  in this file refer to the registers by their raw names (r3..r6), not by
C  these aliases.  mpn_mod_1_1p_cps takes different arguments in r3 and r4.
45define(`ap',  `r3')		C address of the limb array; read from offset 8*n-8 downwards
46define(`n',   `r4')		C number of limbs
47define(`d',   `r5')		C divisor used in the final reduction
48define(`cps', `r6')		C address of the four-word table read at offsets 0, 8, 16, 24
49
50ASM_START()
51
C  mpn_invert_limb is defined outside this file; it is called from
C  mpn_mod_1_1p_cps.
52EXTERN_FUNC(mpn_invert_limb)
53
C mpn_mod_1_1p -- reduce the n-limb number at ap modulo d, using the
C four-word table at cps that mpn_mod_1_1p_cps fills in.
C
C In:   r3 = ap, r4 = n, r5 = d, r6 = cps
C         0(r6)   multiplier bi for the final quotient estimate
C         8(r6)   shift count cnt (only its low 32 bits are loaded)
C         16(r6)  B1modb
C         24(r6)  B2modb
C Out:  r3 = remainder
C Uses: r0, r3, r4, r7-r12, ctr, cr7 and the carry bit.  No stack is used.
C
C NOTE(review): the two most significant limbs are loaded unconditionally, so
C n >= 2 is presumably required -- confirm against the callers.
C NOTE(review): the accumulator is shifted left by cnt before the final
C reduction and the result is shifted right by cnt, so d is presumably passed
C already shifted left by cnt -- confirm against the callers.
54PROLOGUE(mpn_mod_1_1p)
55	sldi	r10, r4, 3		C r10 = 8*n, byte length of the operand
56	addi	r4, r4, -1		C r4 = n-1, becomes the loop counter
57	add	r3, r3, r10		C r3 = ap + 8*n, just past the top limb
58	ld	r0, 16(r6)		C B1modb
59	ld	r12, 24(r6)		C B2modb
60	ld	r9, -8(r3)		C r9 = most significant limb
61	ld	r10, -16(r3)		C r10 = second most significant limb
62	mtctr	r4			C ctr = n-1
63	mulhdu	r8, r9, r0		C r8:r7 = top limb * B1modb
64	mulld	r7, r9, r0
65	addc	r11, r7, r10		C r9:r11 = top limb * B1modb + next limb;
66	addze	r9, r8			C   r9 = high word rh, r11 = low word rl
67	bdz	L(end)			C ctr drops to 0 here when n = 2: skip the loop
68
C Main loop, executed n-2 times.  Each pass folds one more limb into the
C two-word accumulator:  rh:rl = limb + rl*B1modb + rh*B2modb, computed in
C two words.
69	ALIGN(16)
70L(top):	ld	r4, -24(r3)		C r4 = next lower limb
71	addi	r3, r3, -8		C step the pointer down one limb
72	nop				C presumably scheduling padding -- TODO confirm
73	mulld	r10, r11, r0		C r10 = low  word of rl*B1modb
74	mulld	r8, r9, r12		C r8  = low  word of rh*B2modb
75	mulhdu	r11, r11, r0		C r11 = high word of rl*B1modb
76	mulhdu	r9, r9, r12		C r9  = high word of rh*B2modb
77	addc	r7, r10, r4		C r7  = low(rl*B1modb) + limb
78	addze	r10, r11		C r10 = high(rl*B1modb) + carry
79	addc	r11, r8, r7		C new rl
80	adde	r9, r9, r10		C new rh
81	bdnz	L(top)
82
C Final reduction of rh:rl modulo d.  The ifdef below loads the low 32 bits
C of the 64-bit word at 8(r6) into r0; which byte offset holds that half
C depends on the limb endianness.  r0 is the shift count cnt from here on.
83L(end):
84ifdef(`HAVE_LIMB_LITTLE_ENDIAN',
85`	lwz	r0, 8(r6)',
86`	lwz	r0, 12(r6)')
87	ld	r3, 0(r6)		C r3 = bi, first word of the table
88	cmpdi	cr7, r0, 0
89	beq-	cr7, L(4)		C cnt = 0: rh is used unshifted
90	subfic	r10, r0, 64		C r10 = 64 - cnt
91	sld	r9, r9, r0
92	srd	r10, r11, r10
93	or	r9, r10, r9		C rh = (rh << cnt) | (rl >> (64-cnt))
94L(4):	subfc	r10, r5, r9		C carry = 1 iff rh >= d (unsigned)
95	subfe	r10, r10, r10		C r10 = carry - 1
96	nand	r10, r10, r10		C r10 = all ones iff rh >= d, else 0
97	sld	r11, r11, r0		C rl <<= cnt
98	and	r10, r10, r5		C r10 = d or 0
99	subf	r9, r10, r9		C rh -= d when rh >= d
100	mulhdu	r10, r9, r3		C r10 = high word of rh*bi
101	mulld	r3, r9, r3		C r3  = low  word of rh*bi
102	addi	r9, r9, 1		C r9  = rh + 1
103	addc	r8, r3, r11		C r8  = low(rh*bi) + rl, sets carry
104	adde	r3, r10, r9		C r3  = high(rh*bi) + rh + 1 + carry, the quotient estimate
105	mulld	r3, r3, r5		C r3  = estimate * d (low word)
106	subf	r3, r3, r11		C r3  = rl - estimate*d, the candidate remainder r
107	cmpld	cr7, r8, r3
108	bge	cr7, L(5)		C FIXME: Make branch-less
109	add	r3, r3, r5		C r > r8 (unsigned): add d back
110L(5):	cmpld	cr7, r3, r5
111	bge-	cr7, L(10)		C r >= d: one more subtraction is needed
112	srd	r3, r3, r0		C return r >> cnt
113	blr
114
115L(10):	subf	r3, r5, r3		C r -= d
116	srd	r3, r3, r0		C return r >> cnt
117	blr
118EPILOGUE()
119
C mpn_mod_1_1p_cps -- precompute the four-word table that mpn_mod_1_1p reads.
C
C In:   r3 = address of the table to fill (four 64-bit words)
C       r4 = divisor b
C Out:  0(r3)  = bi, the value returned by mpn_invert_limb for b << cnt
C       8(r3)  = cnt, the leading zero count of b
C       16(r3) = the word mpn_mod_1_1p loads as B1modb
C       24(r3) = the word mpn_mod_1_1p loads as B2modb
C
C Non-leaf function: it saves lr and r29-r31 and builds a 144-byte stack
C frame around the call to mpn_invert_limb.
120PROLOGUE(mpn_mod_1_1p_cps,toc)
121	mflr	r0			C r0 = return address
122	std	r29, -24(r1)		C save r29-r31 below the stack pointer
123	std	r30, -16(r1)
124	std	r31, -8(r1)
125	cntlzd	r31, r4			C r31 = cnt = leading zero count of b
126	std	r0, 16(r1)		C save the return address
127	extsw	r31, r31		C sign-extend the low 32 bits of cnt
128	mr	r29, r3			C r29 = table pointer, kept across the call
129	stdu	r1, -144(r1)		C allocate a 144-byte frame
130	sld	r30, r4, r31		C r30 = b << cnt
131	mr	r3, r30			C argument for mpn_invert_limb
C After the call r3 = bi; it is not modified again in this function.
132	CALL(	mpn_invert_limb)
133	cmpdi	cr7, r31, 0
134	neg	r0, r30			C r0 = -(b << cnt); this is B1 when cnt = 0
135	beq-	cr7, L(13)
136	subfic	r11, r31, 64		C r11 = 64 - cnt
137	li	r0, 1
138	neg	r9, r30			C r9 = -(b << cnt)
139	srd	r11, r3, r11		C r11 = bi >> (64-cnt)
140	sld	r0, r0, r31		C r0 = 1 << cnt
141	or	r0, r11, r0
142	mulld	r0, r0, r9		C B1 = -(b << cnt) * ((bi >> (64-cnt)) | (1 << cnt))
C From here r0 = B1.  The next block derives B2 in r9 from B1, bi and b << cnt
C using a quotient estimate followed by one conditional add-back.
143L(13):	mulhdu	r9, r0, r3		C r9  = high word of B1*bi
144	mulld	r11, r0, r3		C r11 = low  word of B1*bi
145	add	r9, r0, r9		C r9  = B1 + high(B1*bi)
146	nor	r9, r9, r9		C r9  = ~r9 = -(B1 + high(B1*bi) + 1)
147	mulld	r9, r9, r30		C r9  = -(quotient estimate) * (b << cnt)
148	cmpld	cr7, r11, r9
149	bge	cr7, L(14)
150	add	r9, r9, r30		C r9 > r11 (unsigned): add b << cnt back
151L(14):	addi	r1, r1, 144		C release the stack frame
152	srd	r0, r0, r31		C r0 = B1 >> cnt
153	std	r31, 8(r29)		C table word 1 = cnt
154	std	r3, 0(r29)		C table word 0 = bi
155	std	r0, 16(r29)		C table word 2 = B1 >> cnt
156	ld	r0, 16(r1)		C reload the return address
157	srd	r9, r9, r31		C r9 = B2 >> cnt; last use of cnt in r31
158	ld	r30, -16(r1)		C restore r30
159	ld	r31, -8(r1)		C restore r31
160	std	r9, 24(r29)		C table word 3 = B2 >> cnt
161	ld	r29, -24(r1)		C restore r29 last: it held the table pointer
162	mtlr	r0
163	blr
164EPILOGUE()
165