xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/powerpc64/mode64/mod_1_4.asm (revision d16b7486a53dcb8072b60ec6fcb4373a2d0c27b7)
1dnl  PowerPC-64 mpn_mod_1s_4p
2
3dnl  Copyright 2010, 2011 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of either:
9dnl
10dnl    * the GNU Lesser General Public License as published by the Free
11dnl      Software Foundation; either version 3 of the License, or (at your
12dnl      option) any later version.
13dnl
14dnl  or
15dnl
16dnl    * the GNU General Public License as published by the Free Software
17dnl      Foundation; either version 2 of the License, or (at your option) any
18dnl      later version.
19dnl
20dnl  or both in parallel, as here.
21dnl
22dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25dnl  for more details.
26dnl
27dnl  You should have received copies of the GNU General Public License and the
28dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29dnl  see https://www.gnu.org/licenses/.
30
31include(`../config.m4')
32
33C                   cycles/limb
34C POWER3/PPC630          ?
35C POWER4/PPC970          9
36C POWER5                 9
37C POWER6                13
38C POWER7                3.5
39
40C TODO
41C  * Optimise, in particular the cps function.  This was compiler-generated and
42C    then hand optimised.
43
44C INPUT PARAMETERS
45define(`ap',  `r3')		C operand pointer; moved to the operand end on entry
46define(`n',   `r4')		C operand length in 64-bit limbs
47define(`d',   `r5')		C modulus used by the final reduction
48define(`cps', `r6')		C pointer to the precomputed table read at offsets 0..48
49
50ASM_START()
51
52EXTERN_FUNC(mpn_invert_limb)
53
C mpn_mod_1s_4p: ap in r3, n in r4, d in r5, cps in r6; result returned in r3.
C Folds the n 64-bit limbs that end at ap + 8*n into a two-register
C accumulator r9:r0 (high:low), four limbs per loop pass, using multipliers
C read from the cps table, then reduces the accumulator against d.
C Table words read here: 0(cps) inverse, 8(cps) shift count, 16..48(cps)
C multipliers.  The same offsets are written by mpn_mod_1s_4p_cps in this file.
C NOTE(review): d looks like it must already be shifted left by the count at
C 8(cps), because the final result is shifted right by it -- confirm in callers.
C r23-r31 are saved at negative offsets from r1 without moving r1.
C NOTE(review): that relies on the ABI permitting stores below r1 -- confirm.
54PROLOGUE(mpn_mod_1s_4p)
55	std	r23, -72(r1)		C save r23-r31 below the stack pointer
56	ld	r23, 48(cps)		C r23 = table word 6, B5modb per the loop comments
57	std	r24, -64(r1)
58	std	r25, -56(r1)
59	ld	r24, 32(cps)		C r24 = table word 4, presumably B3modb
60	ld	r25, 24(cps)		C r25 = table word 3, presumably B2modb
61	std	r26, -48(r1)
62	std	r27, -40(r1)
63	ld	r26, 16(cps)		C r26 = table word 2, presumably B1modb
64	std	r28, -32(r1)
65	std	r29, -24(r1)
66	std	r30, -16(r1)
67	std	r31, -8(r1)
68	ld	r30, 40(cps)		C r30 = table word 5, B4modb per the loop comments
69
70	rldicl.	r0, n, 0,62		C r0 = n mod 4; cr0 reflects the result
71	sldi	r31, n, 3		C r31 = 8*n, the operand size in bytes
72	add	ap, ap, r31		C make ap point at end of operand
73
74	cmpdi	cr7, r0, 2
75	beq	cr0, L(b00)		C n mod 4 = 0
76	blt	cr7, L(b01)		C n mod 4 = 1
77	beq	cr7, L(b10)		C n mod 4 = 2; otherwise fall through, n mod 4 = 3
78
79L(b11):	ld	r11, -16(ap)		C three top limbs: r9:r0 = top*r25 + second*r26 + third
80	ld	r9, -8(ap)
81	ld	r0, -24(ap)
82	mulhdu	r27, r11, r26
83	mulld	r8, r11, r26
84	mulhdu	r11, r9, r25
85	mulld	r9, r9, r25
86	addc	r31, r8, r0
87	addze	r10, r27
88	addc	r0, r9, r31
89	adde	r9, r11, r10
90	addi	ap, ap, -40		C loop offsets -16..8 now address the next four limbs
91	b	L(6)
92
93	ALIGN(16)
94L(b00):	ld	r11, -24(ap)		C four top limbs: top*r24 + second*r25 + third*r26 + fourth
95	ld	r10, -16(ap)
96	ld	r9, -8(ap)
97	ld	r0, -32(ap)
98	mulld	r8, r11, r26
99	mulhdu	r7, r10, r25
100	mulhdu	r27, r11, r26
101	mulhdu	r11, r9, r24
102	mulld	r10, r10, r25
103	mulld	r9, r9, r24
104	addc	r31, r8, r0
105	addze	r0, r27
106	addc	r8, r31, r10
107	adde	r10, r0, r7
108	addc	r0, r9, r8
109	adde	r9, r11, r10
110	addi	ap, ap, -48		C loop offsets -16..8 now address the next four limbs
111	b	L(6)
112
113	ALIGN(16)
114L(b01):	li	r9, 0			C one top limb: accumulator = 0:top
115	ld	r0, -8(ap)
116	addi	ap, ap, -24
117	b	L(6)
118
119	ALIGN(16)
120L(b10):	ld	r9, -8(ap)		C two top limbs: accumulator = top:second
121	ld	r0, -16(ap)
122	addi	ap, ap, -32
123
124	ALIGN(16)
125L(6):	addi	r10, n, 3		C ctr = (n + 3) >> 2
126	srdi	r7, r10, 2
127	mtctr	r7
128	bdz	L(end)			C decrement ctr; skip the loop when it reaches zero
129
130	ALIGN(16)
131L(top):	ld	r31, -16(ap)		C lowest limb of the group, added unscaled
132	ld	r10, -8(ap)		C scaled by r26 below
133	ld	r11, 8(ap)		C scaled by r24 below
134	ld	r12, 0(ap)		C scaled by r25 below
135	mulld	r29, r0, r30		C rl * B4modb
136	mulhdu	r0,  r0, r30		C rl * B4modb
137	mulhdu	r27, r10, r26
138	mulld	r10, r10, r26
139	mulhdu	r7, r9, r23		C rh * B5modb
140	mulld	r9, r9, r23		C rh * B5modb
141	mulhdu	r28, r11, r24
142	mulld	r11, r11, r24
143	mulhdu	r4, r12, r25		C r4 is n; reused as scratch now that ctr holds the count
144	mulld	r12, r12, r25
145	addc	r8, r10, r31		C sum the six terms as two-limb values, low limb first
146	addze	r10, r27
147	addi	ap, ap, -32		C step down four limbs
148	addc	r27, r8, r12
149	adde	r12, r10, r4
150	addc	r11, r27, r11
151	adde	r31, r12, r28
152	addc	r12, r11, r29
153	adde	r4, r31, r0
154	addc	r0, r9, r12		C new accumulator low limb
155	adde	r9, r7, r4		C new accumulator high limb
156	bdnz	L(top)
157
158L(end):		C r3 = low 32 bits of table word 1, the shift count; offset depends on endianness
159ifdef(`HAVE_LIMB_LITTLE_ENDIAN',
160`	lwz	r3, 8(cps)',
161`	lwz	r3, 12(cps)')
162	mulld	r10, r9, r26		C r9:r11 = r9*r26 + r0
163	mulhdu	r9, r9, r26
164	addc	r11, r0, r10
165	addze	r9, r9
166	ld	r10, 0(cps)		C r10 = table word 0, the inverse
167	subfic	r8, r3, 64		C r8 = 64 - shift count
168	sld	r9, r9, r3		C r9:r11 = r9:r11 shifted left by r3
169	srd	r8, r11, r8		C NOTE(review): with r3 = 0 this shifts by 64; assumes srd then gives 0
170	sld	r11, r11, r3
171	or	r9, r8, r9
172	mulld	r0, r9, r10		C r10:r0 = r9 * inverse
173	mulhdu	r10, r9, r10
174	addi	r9, r9, 1
175	addc	r8, r0, r11		C r8 = low product + r11
176	adde	r0, r10, r9		C r0 = high product + r9 + 1 + carry
177	mulld	r0, r0, d
178	subf	r0, r0, r11		C r0 = r11 - r0*d, the remainder candidate
179	cmpld	cr7, r8, r0
180	bge	cr7, L(9)		C add d back only when r8 < r0, unsigned
181	add	r0, r0, d
182L(9):	cmpld	cr7, r0, d
183	bge-	cr7, L(16)		C hinted not-taken: candidate still >= d
184L(10):	srd	r3, r0, r3		C return value = remainder >> shift count
185	ld	r23, -72(r1)		C restore r23-r31
186	ld	r24, -64(r1)
187	ld	r25, -56(r1)
188	ld	r26, -48(r1)
189	ld	r27, -40(r1)
190	ld	r28, -32(r1)
191	ld	r29, -24(r1)
192	ld	r30, -16(r1)
193	ld	r31, -8(r1)
194	blr
195
196L(16):	subf	r0, d, r0		C r0 = r0 - d
197	b	L(10)
198EPILOGUE()
199
C mpn_mod_1s_4p_cps: r3 = address of a seven-word table of 64-bit entries,
C r4 = the value to precompute for.
C NOTE(review): r4 is presumably the unshifted divisor -- confirm in callers.
C Table layout written here, byte offsets from r3:
C    0  value returned by mpn_invert_limb for r4 shifted left by its
C       leading-zero count
C    8  that leading-zero count
C   16..48  five chained multipliers, each shifted right by the count
C Not a leaf routine: LR is saved at 16(r1) and a 144-byte frame is pushed
C for the call, then popped before the table is stored.
200PROLOGUE(mpn_mod_1s_4p_cps,toc)
201	mflr	r0
202	std	r29, -24(r1)		C save r29-r31 before the frame is pushed
203	std	r30, -16(r1)
204	mr	r29, r3			C r29 = table address, kept across the call
205	std	r0, 16(r1)		C save the return address
206	std	r31, -8(r1)
207	stdu	r1, -144(r1)		C push a 144-byte frame
208	cntlzd	r31, r4			C r31 = leading-zero count of r4
209	sld	r30, r4, r31		C r30 = r4 shifted left by that count
210	mr	r3, r30			C argument for mpn_invert_limb
211	CALL(	mpn_invert_limb)
212	subfic	r9, r31, 64		C r3 now holds the inverse; r9 = 64 - count
213	li	r10, 1
214	sld	r10, r10, r31		C r10 = 1 << count
215	srd	r9, r3, r9		C NOTE(review): with count = 0 this shifts by 64; assumes srd then gives 0
216	neg	r0, r30
217	or	r10, r10, r9
218	mulld	r10, r10, r0		C first multiplier: r10 = -r30 * ((1 << count) | (inverse >> (64 - count)))
219	mulhdu	r11, r10, r3		C step: r11 = (~hi(r10*inverse) - r10) * r30
220	nor	r11, r11, r11
221	subf	r11, r10, r11
222	mulld	r11, r11, r30
223	mulld	r0, r10, r3
224	cmpld	cr7, r0, r11
225	bge	cr7, L(18)		C add r30 only when lo(r10*inverse) < r11, unsigned
226	add	r11, r11, r30
227L(18):	mulhdu	r9, r11, r3		C step: r9 = ~(r11 + hi(r11*inverse)) * r30
228	add	r9, r11, r9
229	nor	r9, r9, r9
230	mulld	r9, r9, r30
231	mulld	r0, r11, r3
232	cmpld	cr7, r0, r9
233	bge	cr7, L(19)		C add r30 only when lo(r11*inverse) < r9, unsigned
234	add	r9, r9, r30
235L(19):	mulhdu	r0, r9, r3		C step: r0 = ~(r9 + hi(r9*inverse)) * r30
236	add	r0, r9, r0
237	nor	r0, r0, r0
238	mulld	r0, r0, r30
239	mulld	r8, r9, r3
240	cmpld	cr7, r8, r0
241	bge	cr7, L(20)		C add r30 only when lo(r9*inverse) < r0, unsigned
242	add	r0, r0, r30
243L(20):	mulhdu	r8, r0, r3		C step: r8 = ~(r0 + hi(r0*inverse)) * r30
244	add	r8, r0, r8
245	nor	r8, r8, r8
246	mulld	r8, r8, r30
247	mulld	r7, r0, r3
248	cmpld	cr7, r7, r8
249	bge	cr7, L(21)		C add r30 only when lo(r0*inverse) < r8, unsigned
250	add	r8, r8, r30
251L(21):	srd	r0, r0, r31		C shift every multiplier right by the count before storing
252	addi	r1, r1, 144		C pop the frame; saved registers are addressed from the old r1 again
253	srd	r8, r8, r31
254	srd	r10, r10, r31
255	srd	r11, r11, r31
256	std	r0, 40(r29)		C table word 5; stored before r0 is reused for the return address
257	std	r31, 8(r29)		C table word 1 = leading-zero count
258	srd	r9, r9, r31
259	ld	r0, 16(r1)		C reload the return address
260	ld	r30, -16(r1)
261	std	r8, 48(r29)		C table word 6
262	std	r3, 0(r29)		C table word 0 = inverse
263	mtlr	r0
264	ld	r31, -8(r1)
265	std	r10, 16(r29)		C table word 2
266	std	r11, 24(r29)		C table word 3
267	std	r9, 32(r29)		C table word 4
268	ld	r29, -24(r1)		C r29 restored last, after its final use as the table address
269	blr
270EPILOGUE()
271