dnl  xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/powerpc64/mode64/mod_1_4.asm (revision d11b170b9000ada93db553723522a63d5deac310)
dnl  PowerPC-64 mpn_mod_1s_4p

dnl  Copyright 2010, 2011 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C                   cycles/limb
C POWER3/PPC630          ?
C POWER4/PPC970          9
C POWER5                 9
C POWER6                13
C POWER7                3.5

C TODO
C  * Optimise, in particular the cps function.  This was compiler-generated and
C    then hand optimised.

C INPUT PARAMETERS
define(`ap',  `r3')	C limb array pointer
define(`n',   `r4')	C limb count
define(`d',   `r5')	C divisor
define(`cps', `r6')	C precomputed-constants pointer (see the cps function)
ASM_START()

C mpn_invert_limb is called by the mpn_mod_1s_4p_cps function below.
EXTERN_FUNC(mpn_invert_limb)
C mp_limb_t mpn_mod_1s_4p (mp_srcptr ap, mp_size_t n, mp_limb_t d,
C                          const mp_limb_t cps[])
C
C Return ap[0..n-1] mod d, folding four limbs per loop iteration using the
C constants precomputed by mpn_mod_1s_4p_cps below.  Table layout as read
C here: 0(cps) = inverse of normalised d, 8(cps) = normalisation shift count,
C 16/24/32/40/48(cps) = B^1..B^5 mod d (cf. the B4modb/B5modb loop comments).
C The callee-saved registers are kept at negative offsets from r1, in the
C ABI-guaranteed save area below the stack pointer (leaf function).
PROLOGUE(mpn_mod_1s_4p)
	std	r23, -72(r1)
	ld	r23, 48(cps)		C B5modb
	std	r24, -64(r1)
	std	r25, -56(r1)
	ld	r24, 32(cps)		C B3modb
	ld	r25, 24(cps)		C B2modb
	std	r26, -48(r1)
	std	r27, -40(r1)
	ld	r26, 16(cps)		C B1modb
	std	r28, -32(r1)
	std	r29, -24(r1)
	std	r30, -16(r1)
	std	r31, -8(r1)
	ld	r30, 40(cps)		C B4modb

	rldicl.	r0, n, 0,62		C r0 = n mod 4, sets cr0
	sldi	r31, n, 3
	add	ap, ap, r31		C make ap point at end of operand

C Dispatch on n mod 4 to fold the top 1..4 limbs into the running
C residue pair r9:r0 (r9 = high word, r0 = low word) before the loop.
	cmpdi	cr7, r0, 2
	beq	cr0, L(b00)		C n mod 4 = 0
	blt	cr7, L(b01)		C n mod 4 = 1
	beq	cr7, L(b10)		C n mod 4 = 2

L(b11):	ld	r11, -16(ap)		C n mod 4 = 3: fold 3 top limbs
	ld	r9, -8(ap)
	ld	r0, -24(ap)
	mulhdu	r27, r11, r26
	mulld	r8, r11, r26
	mulhdu	r11, r9, r25
	mulld	r9, r9, r25
	addc	r31, r8, r0
	addze	r10, r27
	addc	r0, r9, r31
	adde	r9, r11, r10
	addi	ap, ap, -40
	b	L(6)

	ALIGN(16)
L(b00):	ld	r11, -24(ap)		C n mod 4 = 0: fold 4 top limbs
	ld	r10, -16(ap)
	ld	r9, -8(ap)
	ld	r0, -32(ap)
	mulld	r8, r11, r26
	mulhdu	r7, r10, r25
	mulhdu	r27, r11, r26
	mulhdu	r11, r9, r24
	mulld	r10, r10, r25
	mulld	r9, r9, r24
	addc	r31, r8, r0
	addze	r0, r27
	addc	r8, r31, r10
	adde	r10, r0, r7
	addc	r0, r9, r8
	adde	r9, r11, r10
	addi	ap, ap, -48
	b	L(6)

	ALIGN(16)
L(b01):	li	r9, 0			C n mod 4 = 1: residue is just the top limb
	ld	r0, -8(ap)
	addi	ap, ap, -24
	b	L(6)

	ALIGN(16)
L(b10):	ld	r9, -8(ap)		C n mod 4 = 2: top two limbs as-is
	ld	r0, -16(ap)
	addi	ap, ap, -32

	ALIGN(16)
L(6):	addi	r10, n, 3		C iterations = floor((n+3)/4)
	srdi	r7, r10, 2
	mtctr	r7
	bdz	L(end)			C skip loop when only the setup limbs exist

C Main loop: combine four more limbs with the residue r9:r0, using
C  r9:r0 <- a3*B3 + a2*B2 + a1*B1 + a0 + rl*B4 + rh*B5  (all mod-reduced
C  constants), accumulated as a two-limb sum with carry chains.
	ALIGN(16)
L(top):	ld	r31, -16(ap)
	ld	r10, -8(ap)
	ld	r11, 8(ap)
	ld	r12, 0(ap)
	mulld	r29, r0, r30		C rl * B4modb
	mulhdu	r0,  r0, r30		C rl * B4modb
	mulhdu	r27, r10, r26
	mulld	r10, r10, r26
	mulhdu	r7, r9, r23		C rh * B5modb
	mulld	r9, r9, r23		C rh * B5modb
	mulhdu	r28, r11, r24
	mulld	r11, r11, r24
	mulhdu	r4, r12, r25
	mulld	r12, r12, r25
	addc	r8, r10, r31
	addze	r10, r27
	addi	ap, ap, -32
	addc	r27, r8, r12
	adde	r12, r10, r4
	addc	r11, r27, r11
	adde	r31, r12, r28
	addc	r12, r11, r29
	adde	r4, r31, r0
	addc	r0, r9, r12
	adde	r9, r7, r4
	bdnz	L(top)

C Final reduction: fold the high residue word via B1modb, normalise by the
C shift count (low 32 bits of the 64-bit count, big-endian offset 12), then
C do one division step with the precomputed inverse at 0(cps).
L(end):	lwz	r3, 12(cps)
	mulld	r10, r9, r26
	mulhdu	r9, r9, r26
	addc	r11, r0, r10
	addze	r9, r9
	ld	r10, 0(cps)		C inverse of normalised d
	subfic	r8, r3, 64
	sld	r9, r9, r3
	srd	r8, r11, r8
	sld	r11, r11, r3
	or	r9, r8, r9
	mulld	r0, r9, r10		C q estimate = hi * inverse
	mulhdu	r10, r9, r10
	addi	r9, r9, 1
	addc	r8, r0, r11
	adde	r0, r10, r9
	mulld	r0, r0, d
	subf	r0, r0, r11		C candidate remainder
	cmpld	cr7, r8, r0
	bge	cr7, L(9)
	add	r0, r0, d		C q was one too large; adjust
L(9):	cmpld	cr7, r0, d
	bge-	cr7, L(16)		C rare extra adjustment
L(10):	srd	r3, r0, r3		C undo normalisation; r3 = return value
	ld	r23, -72(r1)		C restore callee-saved registers
	ld	r24, -64(r1)
	ld	r25, -56(r1)
	ld	r26, -48(r1)
	ld	r27, -40(r1)
	ld	r28, -32(r1)
	ld	r29, -24(r1)
	ld	r30, -16(r1)
	ld	r31, -8(r1)
	blr

L(16):	subf	r0, d, r0
	b	L(10)
EPILOGUE()

C void mpn_mod_1s_4p_cps (mp_limb_t cps[], mp_limb_t d)
C
C Precompute the constants used by mpn_mod_1s_4p above:
C   0(cps)  = mpn_invert_limb of the normalised divisor
C   8(cps)  = normalisation shift count (leading zeros of d)
C   16..48(cps) = B^1..B^5 mod (d << count), each stored shifted right
C                 by the count.
C The B^k values are built iteratively; each nor/mulld/cmpld/add group
C computes the next power modulo the normalised divisor, with a
C conditional correction.
PROLOGUE(mpn_mod_1s_4p_cps)
	mflr	r0
	std	r29, -24(r1)		C save callee-saved regs used here
	std	r30, -16(r1)
	mr	r29, r3			C r29 = cps (r3 is clobbered by the call)
	std	r0, 16(r1)		C save return address in caller's frame
	std	r31, -8(r1)
	stdu	r1, -144(r1)		C frame for the mpn_invert_limb call
	cntlzd	r31, r4			C r31 = shift count
	sld	r30, r4, r31		C r30 = normalised divisor
	mr	r3, r30
	CALL(	mpn_invert_limb)
	nop				C TOC-restore slot after external call
	subfic	r9, r31, 64
	li	r10, 1
	sld	r10, r10, r31
	srd	r9, r3, r9
	neg	r0, r30
	or	r10, r10, r9
	mulld	r10, r10, r0		C r10 = B^1 mod (normalised d)
	mulhdu	r11, r10, r3		C next power: B^2
	nor	r11, r11, r11
	subf	r11, r10, r11
	mulld	r11, r11, r30
	mulld	r0, r10, r3
	cmpld	cr7, r0, r11
	bge	cr7, L(18)
	add	r11, r11, r30		C correction step
L(18):	mulhdu	r9, r11, r3		C next power: B^3
	add	r9, r11, r9
	nor	r9, r9, r9
	mulld	r9, r9, r30
	mulld	r0, r11, r3
	cmpld	cr7, r0, r9
	bge	cr7, L(19)
	add	r9, r9, r30
L(19):	mulhdu	r0, r9, r3		C next power: B^4
	add	r0, r9, r0
	nor	r0, r0, r0
	mulld	r0, r0, r30
	mulld	r8, r9, r3
	cmpld	cr7, r8, r0
	bge	cr7, L(20)
	add	r0, r0, r30
L(20):	mulhdu	r8, r0, r3		C next power: B^5
	add	r8, r0, r8
	nor	r8, r8, r8
	mulld	r8, r8, r30
	mulld	r7, r0, r3
	cmpld	cr7, r7, r8
	bge	cr7, L(21)
	add	r8, r8, r30
L(21):	srd	r0, r0, r31		C unshift each table entry by the count
	addi	r1, r1, 144		C pop frame
	srd	r8, r8, r31
	srd	r10, r10, r31
	srd	r11, r11, r31
	std	r0, 40(r29)		C B4modb
	std	r31, 8(r29)		C shift count
	srd	r9, r9, r31
	ld	r0, 16(r1)
	ld	r30, -16(r1)
	std	r8, 48(r29)		C B5modb
	std	r3, 0(r29)		C inverse
	mtlr	r0
	ld	r31, -8(r1)
	std	r10, 16(r29)		C B1modb
	std	r11, 24(r29)		C B2modb
	std	r9, 32(r29)		C B3modb
	ld	r29, -24(r1)
	blr
EPILOGUE()
