xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/powerpc64/mode64/mod_1_1.asm (revision aceb213538ec08a74028e213127af18aa17bf1cf)
1dnl  PowerPC-64 mpn_mod_1_1p
2
3dnl  Copyright 2010, 2011 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of the GNU Lesser General Public License as published
9dnl  by the Free Software Foundation; either version 3 of the License, or (at
10dnl  your option) any later version.
11
12dnl  The GNU MP Library is distributed in the hope that it will be useful, but
13dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
15dnl  License for more details.
16
17dnl  You should have received a copy of the GNU Lesser General Public License
18dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
19
20include(`../config.m4')
21
22C                   cycles/limb
23C POWER3/PPC630          ?
24C POWER4/PPC970         17
25C POWER5                16
26C POWER6                30
27C POWER7                10.2
28
29C TODO
30C  * Optimise, in particular the cps function.  This was compiler-generated and
31C    then hand optimised.
32
33C INPUT PARAMETERS
34define(`ap',  `r3')		C mpn_mod_1_1p: address of the limbs to reduce
35define(`n',   `r4')		C mpn_mod_1_1p: limb count
36define(`d',   `r5')		C mpn_mod_1_1p: divisor
37define(`cps', `r6')		C mpn_mod_1_1p: address of the precomputed table (names unused in the visible code)
38
39ASM_START()
40
41EXTERN_FUNC(mpn_invert_limb)
42
C mpn_mod_1_1p -- remainder of an n-limb operand modulo a single limb.
C
C In:   r3 = address of the least significant limb
C       r4 = limb count n
C       r5 = divisor
C       r6 = address of the four-doubleword table written by
C            mpn_mod_1_1p_cps in this file:
C              0(r6) inverse, 8(r6) shift count cnt,
C              16(r6) B1modb, 24(r6) B2modb
C Out:  r3 = remainder, shifted right by cnt
C
C NOTE(review): the final compares use r5 directly and the result is
C shifted right by cnt, so r5 is presumably the divisor already shifted
C left by cnt -- confirm against the callers.
C NOTE(review): assumes n >= 2.  The two top limbs are loaded
C unconditionally and CTR starts at n-1 -- confirm callers never pass n < 2.
C
C Leaf routine: touches r0, r3-r12, CTR, CA and cr7 only; no stack frame.
C
C The shift count is read as the full doubleword at 8(r6), matching the
C std that stores it in mpn_mod_1_1p_cps.  A 32-bit load at offset 12
C would see the low half only on big-endian; on little-endian it would
C read zero.
43PROLOGUE(mpn_mod_1_1p)
44	sldi	r10, r4, 3		C r10 = 8*n, operand size in bytes
45	addi	r4, r4, -1		C r4 = n-1, becomes the loop counter
46	add	r3, r3, r10		C r3 = address just past the top limb
47	ld	r0, 16(r6)		C B1modb
48	ld	r12, 24(r6)		C B2modb
49	ld	r9, -8(r3)		C top limb
50	ld	r10, -16(r3)		C second limb from the top
51	mtctr	r4
52	mulhdu	r8, r9, r0
53	mulld	r7, r9, r0		C r8:r7 = top limb * B1modb
54	addc	r11, r7, r10		C rl (r11) = low product + second limb
55	addze	r9, r8			C rh (r9) = high product + carry
56	bdz	L(end)			C CTR = n-2; nothing left when n = 2
57
C Main loop, one limb per iteration, running from high to low:
C   rh:rl = rl * B1modb + rh * B2modb + next limb
58	ALIGN(16)
59L(top):	ld	r4, -24(r3)	C next lower limb
60	addi	r3, r3, -8
61	nop
62	mulld	r10, r11, r0		C low  (rl * B1modb)
63	mulld	r8, r9, r12		C low  (rh * B2modb)
64	mulhdu	r11, r11, r0		C high (rl * B1modb)
65	mulhdu	r9, r9, r12		C high (rh * B2modb)
66	addc	r7, r10, r4		C add the new limb into the low word
67	addze	r10, r11		C propagate carry into the high word
68	addc	r11, r8, r7		C rl = sum of the low words
69	adde	r9, r9, r10		C rh = sum of the high words + carry
70	bdnz	L(top)
71
C Final reduction of the two-limb residue rh:rl.
72L(end):	ld	r0, 8(r6)	C cnt, full doubleword as stored (endian-neutral)
73	ld	r3, 0(r6)		C inverse
74	cmpdi	cr7, r0, 0
75	beq-	cr7, L(4)		C cnt = 0: no bits to move from rl into rh
76	subfic	r10, r0, 64		C r10 = 64 - cnt
77	sld	r9, r9, r0
78	srd	r10, r11, r10
79	or	r9, r10, r9		C rh = (rh << cnt) | (rl >> (64 - cnt))
80L(4):	subfc	r10, r5, r9		C CA = 1 when rh >= divisor
81	subfe	r10, r10, r10		C r10 = 0 when rh >= divisor, else -1
82	nand	r10, r10, r10		C invert: all ones when rh >= divisor
83	sld	r11, r11, r0		C rl <<= cnt
84	and	r10, r10, r5
85	subf	r9, r10, r9		C rh -= divisor when rh >= divisor
86	mulhdu	r10, r9, r3
87	mulld	r3, r9, r3		C r10:r3 = rh * inverse
88	addi	r9, r9, 1
89	addc	r8, r3, r11		C q0 = low product + rl
90	adde	r3, r10, r9		C q1 = high product + rh + 1 + carry
91	mulld	r3, r3, r5
92	subf	r3, r3, r11		C r = rl - q1 * divisor (mod 2^64)
93	cmpld	cr7, r8, r3
94	bge	cr7, L(5)		C FIXME: Make branch-less
95	add	r3, r3, r5		C q0 < r: add the divisor back
96L(5):	cmpld	cr7, r3, r5
97	bge-	cr7, L(10)		C r >= divisor: one more subtraction
98	srd	r3, r3, r0		C undo the shift by cnt
99	blr
100
101L(10):	subf	r3, r5, r3
102	srd	r3, r3, r0		C undo the shift by cnt
103	blr
104EPILOGUE()
105
C mpn_mod_1_1p_cps -- fill the four-doubleword table read by mpn_mod_1_1p.
C
C In:   r3 = address of the table to fill
C       r4 = divisor d
C Out:  0(table)  = bi, the value returned by mpn_invert_limb for b
C       8(table)  = cnt, the count of leading zero bits of d
C       16(table) = B1modb >> cnt
C       24(table) = B2modb >> cnt
C where b = d << cnt.  The names B1modb and B2modb follow the comments on
C the matching loads in mpn_mod_1_1p.
C
C Non-leaf: r29-r31 are stored below the entry stack pointer, LR goes to
C 16(r1), and a 144-byte frame is pushed around the call.
106PROLOGUE(mpn_mod_1_1p_cps)
107	mflr	r0			C r0 = return address
108	std	r29, -24(r1)		C save r29-r31; reloaded before return
109	std	r30, -16(r1)
110	std	r31, -8(r1)
111	cntlzd	r31, r4			C r31 = cnt = leading zero bits of d
112	std	r0, 16(r1)		C save LR
113	extsw	r31, r31
114	mr	r29, r3			C r29 = table address, kept across the call
115	stdu	r1, -144(r1)		C push a 144-byte frame
116	sld	r30, r4, r31		C r30 = b = d << cnt
117	mr	r3, r30			C argument for mpn_invert_limb
118	CALL(	mpn_invert_limb)	C r3 = bi on return
119	nop				C NOTE(review): presumably the linker TOC-restore slot -- confirm
120	cmpdi	cr7, r31, 0
121	neg	r0, r30			C B1modb = -b, used as is when cnt = 0
122	beq-	cr7, L(13)
123	subfic	r11, r31, 64		C r11 = 64 - cnt
124	li	r0, 1
125	neg	r9, r30			C r9 = -b
126	srd	r11, r3, r11		C bi >> (64 - cnt)
127	sld	r0, r0, r31		C 1 << cnt
128	or	r0, r11, r0
129	mulld	r0, r0, r9		C B1modb = -b * ((bi >> (64 - cnt)) | (1 << cnt))
C Derive B2modb (r9) from B1modb (r0) using bi and b.
130L(13):	mulhdu	r9, r0, r3	C high (B1modb * bi)
131	mulld	r11, r0, r3		C low  (B1modb * bi)
132	add	r9, r0, r9		C q = B1modb + high product
133	nor	r9, r9, r9		C ~q
134	mulld	r9, r9, r30		C r = ~q * b (mod 2^64)
135	cmpld	cr7, r11, r9
136	bge	cr7, L(14)		C low product >= r: keep r
137	add	r9, r9, r30		C otherwise r += b
138L(14):	addi	r1, r1, 144	C pop the frame
139	srd	r0, r0, r31		C B1modb >> cnt
140	std	r31, 8(r29)		C table[1] = cnt, stored as a full doubleword
141	std	r3, 0(r29)		C table[0] = bi
142	std	r0, 16(r29)		C table[2] = B1modb >> cnt
143	ld	r0, 16(r1)		C reload the saved LR
144	srd	r9, r9, r31		C B2modb >> cnt
145	ld	r30, -16(r1)
146	ld	r31, -8(r1)
147	std	r9, 24(r29)		C table[3] = B2modb >> cnt; r29 still holds the table address
148	ld	r29, -24(r1)
149	mtlr	r0
150	blr
151EPILOGUE()
152