dnl  PowerPC-64 mpn_sqr_basecase.

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  Copyright 1999-2001, 2003-2006, 2008, 2010, 2011 Free Software Foundation,
dnl  Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C                  cycles/limb
C POWER3/PPC630         6-18
C POWER4/PPC970          8
C POWER5                 8
C POWER6                16.25
C POWER7                 3.77

C NOTES
C  * This is very crude; clean it up!
C  * Try to reduce the number of needed live registers.
C  * Rewrite for POWER6 to use 8 consecutive muls, not 2 groups of 4.  The
C    cost will be more live registers.
C  * Rewrite for POWER7 to use addmul_2 building blocks; this will reduce code
C    size a lot and speed things up perhaps 25%.
C  * Use computed goto in order to compress the code.
C  * Implement a larger final corner.
C  * Schedule callee-saves register saves into other insns.  This could save
C    about 5 cycles/call.  (We cannot analogously optimise the restores, since
C    the sqr_diag_addlsh1 loop has no wind-down code as currently written.)
C  * Should the alternating std/adde sequences be split?  Some pipelines handle
C    adde poorly, and might sequentialise all these instructions.
C  * The sqr_diag_addlsh1 loop was written for POWER6 and its preferences for
C    adjacent integer multiply insns.  Except for the multiply insns, the code
C    was not carefully optimised for POWER6 or any other CPU.
C  * Perform cross-jumping in sqr_diag_addlsh1's feed-in code, into the loop.

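C  The overall scheme is the usual basecase one: with B = 2^64,
C
C    U^2 = 2 * sum(i<j) u[i]*u[j] * B^(i+j)  +  sum(i) u[i]^2 * B^(2i)
C
C  The off-diagonal triangle is built first, as one mul_1-style row
C  followed by addmul_1-style rows (the outer loop below); the
C  sqr_diag_addlsh1 part then doubles that area while adding in the
C  diagonal squares.  A rough C model of the triangle phase, assuming
C  the usual mpn_mul_1/mpn_addmul_1 semantics (a sketch of the maths,
C  not of this file's register-level schedule):
C
C	rp[n] = mpn_mul_1 (rp + 1, up + 1, n - 1, up[0]);
C	for (i = 1; i < n - 1; i++)
C	  rp[n + i] = mpn_addmul_1 (rp + 2 * i + 1, up + i + 1,
C				    n - 1 - i, up[i]);
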
C INPUT PARAMETERS
define(`rp', `r3')
define(`up', `r4')
define(`n',  `r5')

define(`rp_outer', `r25')
define(`up_outer', `r21')
define(`rp_saved', `r22')
define(`up_saved', `r23')
define(`n_saved',  `r24')

ASM_START()
PROLOGUE(mpn_sqr_basecase)
	cmpdi	cr0, n, 2
	bge	cr0, L(ge2)
	ld	r5, 0(up)	C n = 1
	nop
	mulld	r8, r5, r5	C weight 0
	mulhdu	r9, r5, r5	C weight 1
	std	r8, 0(rp)
	std	r9, 8(rp)
	blr
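
C  n = 2:  (u1*B + u0)^2 = u1^2*B^2 + 2*u1*u0*B + u0^2 with B = 2^64.
C  Doubling the cross product u1*u0 can carry into the top limb, and so
C  can the subsequent add, hence the two addze into r11.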
	ALIGN(16)
L(ge2):	bgt	cr0, L(gt2)
	ld	r0, 0(up)	C n = 2
	nop
	mulld	r8, r0, r0	C u0 * u0
	mulhdu	r9, r0, r0	C u0 * u0
	ld	r6, 8(up)
	mulld	r10, r6, r6	C u1 * u1
	mulhdu	r11, r6, r6	C u1 * u1
	mulld	r4, r6, r0	C u1 * u0
	mulhdu	r5, r6, r0	C u1 * u0
	addc	r4, r4, r4
	adde	r5, r5, r5
	addze	r11, r11
	addc	r9, r9, r4
	adde	r10, r10, r5
	addze	r11, r11
	std	r8, 0(rp)
	std	r9, 8(rp)
	std	r10, 16(rp)
	std	r11, 24(rp)
	blr

	ALIGN(16)
L(gt2):	std	r31,  -8(r1)
	std	r30, -16(r1)
	std	r29, -24(r1)
	std	r28, -32(r1)
	std	r27, -40(r1)
	std	r26, -48(r1)
	std	r25, -56(r1)
	std	r24, -64(r1)
	std	r23, -72(r1)
	std	r22, -80(r1)
	std	r21, -88(r1)

	mr	rp_saved, rp
	mr	up_saved, up
	mr	n_saved, n
	mr	rp_outer, rp
	mr	up_outer, up

	rldicl.	r0, n, 0,62	C r0 = n & 3, set cr0
	cmpdi	cr6, r0, 2
	addic	r7, n, 2	C compute count (addic clears CA too)...
	srdi	r7, r7, 2	C ...for ctr
	mtctr	r7		C copy count into ctr
	beq-	cr0, L(b0)
	blt-	cr6, L(b1)
	beq-	cr6, L(b2)

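C  First row: rp[1 .. n-1] = u0 * u[1 .. n-1], i.e. a mul_1.  The inner
C  loop is 4-way unrolled; L(b0)..L(b3) are the n mod 4 feed-ins, each
C  entering its own copy of the loop.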
L(b3):	ld	r6, 0(up)
	ld	r9, 8(up)
	ld	r27, 16(up)
	addi	up, up, 24
	li	r12, 0		C carry limb
	bdz	L(em3)

	ALIGN(16)
L(tm3):	mulld	r0, r9, r6
	mulhdu	r26, r9, r6
	mulld	r7, r27, r6
	mulhdu	r8, r27, r6
	ld	r9, 0(up)
	ld	r27, 8(up)
	adde	r0, r0, r12
	adde	r7, r7, r26
	mulld	r26, r9, r6
	mulhdu	r10, r9, r6
	mulld	r11, r27, r6
	mulhdu	r12, r27, r6
	ld	r9, 16(up)
	ld	r27, 24(up)
	std	r0, 8(rp)
	adde	r26, r26, r8
	std	r7, 16(rp)
	adde	r11, r11, r10
	std	r26, 24(rp)
	addi	up, up, 32
	std	r11, 32(rp)
	addi	rp, rp, 32
	bdnz	L(tm3)

L(em3):	mulld	r0, r9, r6
	mulhdu	r26, r9, r6
	mulld	r7, r27, r6
	mulhdu	r8, r27, r6
	adde	r0, r0, r12
	adde	r7, r7, r26
	std	r0, 8(rp)
	std	r7, 16(rp)
	addze	r8, r8
	std	r8, 24(rp)
	addi	n, n, 2
	b	L(outer_loop)

L(b0):	ld	r6, 0(up)
	ld	r27, 8(up)
	mulld	r7, r27, r6
	mulhdu	r12, r27, r6
	std	r7, 8(rp)
	addi	rp, rp, 8
	ld	r9, 16(up)
	ld	r27, 24(up)
	addi	up, up, 32
	bdz	L(em0)

	ALIGN(16)
L(tm0):	mulld	r0, r9, r6
	mulhdu	r26, r9, r6
	mulld	r7, r27, r6
	mulhdu	r8, r27, r6
	ld	r9, 0(up)
	ld	r27, 8(up)
	adde	r0, r0, r12
	adde	r7, r7, r26
	mulld	r26, r9, r6
	mulhdu	r10, r9, r6
	mulld	r11, r27, r6
	mulhdu	r12, r27, r6
	ld	r9, 16(up)
	ld	r27, 24(up)
	std	r0, 8(rp)
	adde	r26, r26, r8
	std	r7, 16(rp)
	adde	r11, r11, r10
	std	r26, 24(rp)
	addi	up, up, 32
	std	r11, 32(rp)
	addi	rp, rp, 32
	bdnz	L(tm0)

L(em0):	mulld	r0, r9, r6
	mulhdu	r26, r9, r6
	mulld	r7, r27, r6
	mulhdu	r8, r27, r6
	adde	r0, r0, r12
	adde	r7, r7, r26
	std	r0, 8(rp)
	std	r7, 16(rp)
	addze	r8, r8
	std	r8, 24(rp)
	addi	n, n, 2
	b	L(outer_loop_ent_2)

L(b1):	ld	r6, 0(up)
	ld	r9, 8(up)
	ld	r27, 16(up)
	mulld	r0, r9, r6
	mulhdu	r26, r9, r6
	mulld	r7, r27, r6
	mulhdu	r12, r27, r6
	addc	r7, r7, r26
	std	r0, 8(rp)
	std	r7, 16(rp)
	addi	rp, rp, 16
	ld	r9, 24(up)
	ld	r27, 32(up)
	addi	up, up, 40
	bdz	L(em1)

	ALIGN(16)
L(tm1):	mulld	r0, r9, r6
	mulhdu	r26, r9, r6
	mulld	r7, r27, r6
	mulhdu	r8, r27, r6
	ld	r9, 0(up)
	ld	r27, 8(up)
	adde	r0, r0, r12
	adde	r7, r7, r26
	mulld	r26, r9, r6
	mulhdu	r10, r9, r6
	mulld	r11, r27, r6
	mulhdu	r12, r27, r6
	ld	r9, 16(up)
	ld	r27, 24(up)
	std	r0, 8(rp)
	adde	r26, r26, r8
	std	r7, 16(rp)
	adde	r11, r11, r10
	std	r26, 24(rp)
	addi	up, up, 32
	std	r11, 32(rp)
	addi	rp, rp, 32
	bdnz	L(tm1)

L(em1):	mulld	r0, r9, r6
	mulhdu	r26, r9, r6
	mulld	r7, r27, r6
	mulhdu	r8, r27, r6
	adde	r0, r0, r12
	adde	r7, r7, r26
	std	r0, 8(rp)
	std	r7, 16(rp)
	addze	r8, r8
	std	r8, 24(rp)
	addi	n, n, 2
	b	L(outer_loop_ent_3)

L(b2):	addi	r7, r7, -1	C FIXME
	mtctr	r7		C FIXME
	ld	r6, 0(up)
	ld	r9, 8(up)
	ld	r27, 16(up)
	mulld	r0, r9, r6
	mulhdu	r26, r9, r6
	mulld	r7, r27, r6
	mulhdu	r8, r27, r6
	ld	r9, 24(up)
	mulld	r11, r9, r6
	mulhdu	r10, r9, r6
	addc	r7, r7, r26
	adde	r11, r11, r8
	addze	r12, r10
	std	r0, 8(rp)
	std	r7, 16(rp)
	std	r11, 24(rp)
	addi	rp, rp, 24
	ld	r9, 32(up)
	ld	r27, 40(up)
	addi	up, up, 48
	bdz	L(em2)

	ALIGN(16)
L(tm2):	mulld	r0, r9, r6
	mulhdu	r26, r9, r6
	mulld	r7, r27, r6
	mulhdu	r8, r27, r6
	ld	r9, 0(up)
	ld	r27, 8(up)
	adde	r0, r0, r12
	adde	r7, r7, r26
	mulld	r26, r9, r6
	mulhdu	r10, r9, r6
	mulld	r11, r27, r6
	mulhdu	r12, r27, r6
	ld	r9, 16(up)
	ld	r27, 24(up)
	std	r0, 8(rp)
	adde	r26, r26, r8
	std	r7, 16(rp)
	adde	r11, r11, r10
	std	r26, 24(rp)
	addi	up, up, 32
	std	r11, 32(rp)
	addi	rp, rp, 32
	bdnz	L(tm2)

L(em2):	mulld	r0, r9, r6
	mulhdu	r26, r9, r6
	mulld	r7, r27, r6
	mulhdu	r8, r27, r6
	adde	r0, r0, r12
	adde	r7, r7, r26
	std	r0, 8(rp)
	std	r7, 16(rp)
	addze	r8, r8
	std	r8, 24(rp)
	addi	n, n, 2
	b	L(outer_loop_ent_0)


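C  Outer loop: each pass folds in one more triangle row, in effect
C  rp[1 ..] += u[0] * u[1 .. n-1] (an addmul_1), with up advanced one
C  limb and rp two limbs per pass.  The four entry points,
C  L(outer_loop) and L(outer_loop_ent_0/_2/_3), track the shrinking row
C  length mod 4 so the 4-way unrolled inner loops need no per-pass
C  feed-in dispatch.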
L(outer_loop):
	addi	n, n, -1
	addi	up_outer, up_outer, 8
	addi	rp_outer, rp_outer, 16

	mr	up, up_outer
	addi	rp, rp_outer, 8

	srdi	r0, n, 2
	mtctr	r0

	bdz	L(outer_end)

	ld	r6, 0(up)
	ld	r9, 8(up)
	ld	r27, 16(up)
	mulld	r0, r9, r6
	mulhdu	r26, r9, r6
	mulld	r7, r27, r6
	mulhdu	r8, r27, r6
	ld	r9, 24(up)
	ld	r28, 0(rp)
	ld	r29, 8(rp)
	ld	r30, 16(rp)
	mulld	r11, r9, r6
	mulhdu	r10, r9, r6
	addc	r7, r7, r26
	adde	r11, r11, r8
	addze	r12, r10
	addc	r0, r0, r28
	std	r0, 0(rp)
	adde	r7, r7, r29
	std	r7, 8(rp)
	adde	r11, r11, r30
	std	r11, 16(rp)
	addi	rp, rp, 24
	ld	r9, 32(up)
	ld	r27, 40(up)
	addi	up, up, 48
	bdz	L(ea1)

	ALIGN(16)
L(ta1):	mulld	r0, r9, r6
	mulhdu	r26, r9, r6	C 9
	mulld	r7, r27, r6
	mulhdu	r8, r27, r6	C 27
	ld	r9, 0(up)
	ld	r28, 0(rp)
	ld	r27, 8(up)
	ld	r29, 8(rp)
	adde	r0, r0, r12	C 0 12
	adde	r7, r7, r26	C 5 7
	mulld	r26, r9, r6
	mulhdu	r10, r9, r6	C 9
	mulld	r11, r27, r6
	mulhdu	r12, r27, r6	C 27
	ld	r9, 16(up)
	ld	r30, 16(rp)
	ld	r27, 24(up)
	ld	r31, 24(rp)
	adde	r26, r26, r8	C 8 5
	adde	r11, r11, r10	C 10 11
	addze	r12, r12	C 12
	addc	r0, r0, r28	C 0 28
	std	r0, 0(rp)	C 0
	adde	r7, r7, r29	C 7 29
	std	r7, 8(rp)	C 7
	adde	r26, r26, r30	C 5 30
	std	r26, 16(rp)	C 5
	adde	r11, r11, r31	C 11 31
	std	r11, 24(rp)	C 11
	addi	up, up, 32
	addi	rp, rp, 32
	bdnz	L(ta1)

L(ea1):	mulld	r0, r9, r6
	mulhdu	r26, r9, r6
	mulld	r7, r27, r6
	mulhdu	r8, r27, r6
	ld	r28, 0(rp)
	ld	r29, 8(rp)
	adde	r0, r0, r12
	adde	r7, r7, r26
	addze	r8, r8
	addc	r0, r0, r28
	std	r0, 0(rp)
	adde	r7, r7, r29
	std	r7, 8(rp)
	addze	r8, r8
	std	r8, 16(rp)

L(outer_loop_ent_0):
	addi	n, n, -1
	addi	up_outer, up_outer, 8
	addi	rp_outer, rp_outer, 16

	mr	up, up_outer
	addi	rp, rp_outer, 8

	srdi	r0, n, 2
	mtctr	r0

	ld	r6, 0(up)
	ld	r9, 8(up)
	ld	r27, 16(up)
	ld	r28, 0(rp)
	ld	r29, 8(rp)
	mulld	r0, r9, r6
	mulhdu	r26, r9, r6
	mulld	r7, r27, r6
	mulhdu	r8, r27, r6
	addc	r0, r0, r28
	adde	r7, r7, r26
	addze	r12, r8
	std	r0, 0(rp)
	adde	r7, r7, r29
	std	r7, 8(rp)
	addi	rp, rp, 16
	ld	r9, 24(up)
	ld	r27, 32(up)
	addi	up, up, 40
	bdz	L(ea0)

	ALIGN(16)
L(ta0):	mulld	r0, r9, r6
	mulhdu	r26, r9, r6	C 9
	mulld	r7, r27, r6
	mulhdu	r8, r27, r6	C 27
	ld	r9, 0(up)
	ld	r28, 0(rp)
	ld	r27, 8(up)
	ld	r29, 8(rp)
	adde	r0, r0, r12	C 0 12
	adde	r7, r7, r26	C 5 7
	mulld	r26, r9, r6
	mulhdu	r10, r9, r6	C 9
	mulld	r11, r27, r6
	mulhdu	r12, r27, r6	C 27
	ld	r9, 16(up)
	ld	r30, 16(rp)
	ld	r27, 24(up)
	ld	r31, 24(rp)
	adde	r26, r26, r8	C 8 5
	adde	r11, r11, r10	C 10 11
	addze	r12, r12	C 12
	addc	r0, r0, r28	C 0 28
	std	r0, 0(rp)	C 0
	adde	r7, r7, r29	C 7 29
	std	r7, 8(rp)	C 7
	adde	r26, r26, r30	C 5 30
	std	r26, 16(rp)	C 5
	adde	r11, r11, r31	C 11 31
	std	r11, 24(rp)	C 11
	addi	up, up, 32
	addi	rp, rp, 32
	bdnz	L(ta0)

L(ea0):	mulld	r0, r9, r6
	mulhdu	r26, r9, r6
	mulld	r7, r27, r6
	mulhdu	r8, r27, r6
	ld	r28, 0(rp)
	ld	r29, 8(rp)
	adde	r0, r0, r12
	adde	r7, r7, r26
	addze	r8, r8
	addc	r0, r0, r28
	std	r0, 0(rp)
	adde	r7, r7, r29
	std	r7, 8(rp)
	addze	r8, r8
	std	r8, 16(rp)

L(outer_loop_ent_3):
	addi	n, n, -1
	addi	up_outer, up_outer, 8
	addi	rp_outer, rp_outer, 16

	mr	up, up_outer
	addi	rp, rp_outer, 8

	srdi	r0, n, 2
	mtctr	r0

	ld	r6, 0(up)
	ld	r9, 8(up)
	ld	r28, 0(rp)
	mulld	r0, r9, r6
	mulhdu	r12, r9, r6
	addc	r0, r0, r28
	std	r0, 0(rp)
	addi	rp, rp, 8
	ld	r9, 16(up)
	ld	r27, 24(up)
	addi	up, up, 32
	bdz	L(ea3)

	ALIGN(16)
L(ta3):	mulld	r0, r9, r6
	mulhdu	r26, r9, r6	C 9
	mulld	r7, r27, r6
	mulhdu	r8, r27, r6	C 27
	ld	r9, 0(up)
	ld	r28, 0(rp)
	ld	r27, 8(up)
	ld	r29, 8(rp)
	adde	r0, r0, r12	C 0 12
	adde	r7, r7, r26	C 5 7
	mulld	r26, r9, r6
	mulhdu	r10, r9, r6	C 9
	mulld	r11, r27, r6
	mulhdu	r12, r27, r6	C 27
	ld	r9, 16(up)
	ld	r30, 16(rp)
	ld	r27, 24(up)
	ld	r31, 24(rp)
	adde	r26, r26, r8	C 8 5
	adde	r11, r11, r10	C 10 11
	addze	r12, r12	C 12
	addc	r0, r0, r28	C 0 28
	std	r0, 0(rp)	C 0
	adde	r7, r7, r29	C 7 29
	std	r7, 8(rp)	C 7
	adde	r26, r26, r30	C 5 30
	std	r26, 16(rp)	C 5
	adde	r11, r11, r31	C 11 31
	std	r11, 24(rp)	C 11
	addi	up, up, 32
	addi	rp, rp, 32
	bdnz	L(ta3)

L(ea3):	mulld	r0, r9, r6
	mulhdu	r26, r9, r6
	mulld	r7, r27, r6
	mulhdu	r8, r27, r6
	ld	r28, 0(rp)
	ld	r29, 8(rp)
	adde	r0, r0, r12
	adde	r7, r7, r26
	addze	r8, r8
	addc	r0, r0, r28
	std	r0, 0(rp)
	adde	r7, r7, r29
	std	r7, 8(rp)
	addze	r8, r8
	std	r8, 16(rp)


L(outer_loop_ent_2):
	addi	n, n, -1
	addi	up_outer, up_outer, 8
	addi	rp_outer, rp_outer, 16

	mr	up, up_outer
	addi	rp, rp_outer, 8

	srdi	r0, n, 2
	mtctr	r0

	addic	r0, r0, 0	C clear CA for the adde loop entry
	li	r12, 0		C cy_limb = 0
	ld	r6, 0(up)
	ld	r9, 8(up)
	ld	r27, 16(up)
	bdz	L(ea2)
	addi	up, up, 24

	ALIGN(16)
L(ta2):	mulld	r0, r9, r6
	mulhdu	r26, r9, r6	C 9
	mulld	r7, r27, r6
	mulhdu	r8, r27, r6	C 27
	ld	r9, 0(up)
	ld	r28, 0(rp)
	ld	r27, 8(up)
	ld	r29, 8(rp)
	adde	r0, r0, r12	C 0 12
	adde	r7, r7, r26	C 5 7
	mulld	r26, r9, r6
	mulhdu	r10, r9, r6	C 9
	mulld	r11, r27, r6
	mulhdu	r12, r27, r6	C 27
	ld	r9, 16(up)
	ld	r30, 16(rp)
	ld	r27, 24(up)
	ld	r31, 24(rp)
	adde	r26, r26, r8	C 8 5
	adde	r11, r11, r10	C 10 11
	addze	r12, r12	C 12
	addc	r0, r0, r28	C 0 28
	std	r0, 0(rp)	C 0
	adde	r7, r7, r29	C 7 29
	std	r7, 8(rp)	C 7
	adde	r26, r26, r30	C 5 30
	std	r26, 16(rp)	C 5
	adde	r11, r11, r31	C 11 31
	std	r11, 24(rp)	C 11
	addi	up, up, 32
	addi	rp, rp, 32
	bdnz	L(ta2)

L(ea2):	mulld	r0, r9, r6
	mulhdu	r26, r9, r6
	mulld	r7, r27, r6
	mulhdu	r8, r27, r6
	ld	r28, 0(rp)
	ld	r29, 8(rp)
	adde	r0, r0, r12
	adde	r7, r7, r26
	addze	r8, r8
	addc	r0, r0, r28
	std	r0, 0(rp)
	adde	r7, r7, r29
	std	r7, 8(rp)
	addze	r8, r8
	std	r8, 16(rp)

	b	L(outer_loop)

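C  Final corner: the last off-diagonal product u[n-2]*u[n-1] is handled
C  by itself (cf. the "larger final corner" note above).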
L(outer_end):
	ld	r6, 0(up)
	ld	r9, 8(up)
	ld	r11, 0(rp)
	mulld	r0, r9, r6
	mulhdu	r8, r9, r6
	addc	r0, r0, r11
	std	r0, 0(rp)
	addze	r8, r8
	std	r8, 8(rp)

define(`rp',  `rp_saved')
define(`up',  `r5')
define(`n',   `r6')
define(`climb',	`r0')

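C  sqr_diag_addlsh1: in a single pass, double the off-diagonal area
C  computed above and add in the diagonal squares, i.e. with B = 2^64
C
C    rp[0 .. 2n-1] = 2 * rp[0 .. 2n-1] + sum(i) u[i]^2 * B^(2i)
C
C  (taking the not-yet-written rp[0] and rp[2n-1] as zero).  The
C  doubling is done on the fly by the adde rX,rX,rX chains; climb
C  carries each group's last square high limb, plus the doubling
C  carry, into the next group.  L(xb0)..L(xb3) are the n mod 4
C  feed-ins to the 4-way unrolled L(top) loop.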
	addi	r4, rp_saved, 8
	mr	r5, up_saved
	mr	r6, n_saved

	rldicl.	r0, n, 0,62		C r0 = n & 3, set cr0
	cmpdi	cr6, r0, 2
	addi	n, n, 2			C compute count...
	srdi	n, n, 2			C ...for ctr
	mtctr	n			C put loop count into ctr
	beq	cr0, L(xb0)
	blt	cr6, L(xb1)
	beq	cr6, L(xb2)

L(xb3):	ld	r6,   0(up)
	ld	r7,   8(up)
	ld	r12, 16(up)
	addi	up, up, 24
	mulld	r24, r6, r6
	mulhdu	r25, r6, r6
	mulld	r26, r7, r7
	mulhdu	r27, r7, r7
	mulld	r28, r12, r12
	mulhdu	r29, r12, r12
	ld	r10,  8(rp)
	ld	r11, 16(rp)
	ld	r6,  24(rp)
	ld	r7,  32(rp)
	addc	r10, r10, r10
	adde	r11, r11, r11
	adde	r6, r6, r6
	adde	r7, r7, r7
	addze	climb, r29
	addc	r10, r10, r25
	adde	r11, r11, r26
	adde	r6, r6, r27
	adde	r7, r7, r28
	std	r24,  0(rp)
	std	r10,  8(rp)
	std	r11, 16(rp)
	std	r6,  24(rp)
	std	r7,  32(rp)
	addi	rp, rp, 40
	bdnz	L(top)
	b	L(end)

L(xb2):	ld	r6,  0(up)
	ld	r7,  8(up)
	addi	up, up, 16
	mulld	r24, r6, r6
	mulhdu	r25, r6, r6
	mulld	r26, r7, r7
	mulhdu	r27, r7, r7
	ld	r10,  8(rp)
	ld	r11, 16(rp)
	addc	r10, r10, r10
	adde	r11, r11, r11
	addze	climb, r27
	addc	r10, r10, r25
	adde	r11, r11, r26
	std	r24,  0(rp)
	std	r10,  8(rp)
	std	r11, 16(rp)
	addi	rp, rp, 24
	bdnz	L(top)
	b	L(end)

L(xb0):	ld	r6,   0(up)
	ld	r7,   8(up)
	ld	r12, 16(up)
	ld	r23, 24(up)
	addi	up, up, 32
	mulld	r24, r6, r6
	mulhdu	r25, r6, r6
	mulld	r26, r7, r7
	mulhdu	r27, r7, r7
	mulld	r28, r12, r12
	mulhdu	r29, r12, r12
	mulld	r30, r23, r23
	mulhdu	r31, r23, r23
	ld	r10,  8(rp)
	ld	r11, 16(rp)
	ld	r6,  24(rp)
	ld	r7,  32(rp)
	ld	r12, 40(rp)
	ld	r23, 48(rp)
	addc	r10, r10, r10
	adde	r11, r11, r11
	adde	r6, r6, r6
	adde	r7, r7, r7
	adde	r12, r12, r12
	adde	r23, r23, r23
	addze	climb, r31
	std	r24,  0(rp)
	addc	r10, r10, r25
	std	r10,  8(rp)
	adde	r11, r11, r26
	std	r11, 16(rp)
	adde	r6, r6, r27
	std	r6,  24(rp)
	adde	r7, r7, r28
	std	r7,  32(rp)
	adde	r12, r12, r29
	std	r12, 40(rp)
	adde	r23, r23, r30
	std	r23, 48(rp)
	addi	rp, rp, 56
	bdnz	L(top)
	b	L(end)

L(xb1):	ld	r6,  0(up)
	addi	up, up, 8
	mulld	r24, r6, r6
	mulhdu	climb, r6, r6
	std	r24, 0(rp)
	addic	rp, rp, 8		C clear carry as side-effect

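C  Main loop: per iteration, square four up limbs and fold them into
C  eight rp limbs, doubling those limbs on the fly.  The doubling chain
C  starts with adde, deliberately consuming the carry left by the
C  preceding group (L(xb1) clears it with addic for the first pass).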
	ALIGN(32)
L(top):	ld	r6,   0(up)
	ld	r7,   8(up)
	ld	r12, 16(up)
	ld	r23, 24(up)
	addi	up, up, 32
	mulld	r24, r6, r6
	mulhdu	r25, r6, r6
	mulld	r26, r7, r7
	mulhdu	r27, r7, r7
	mulld	r28, r12, r12
	mulhdu	r29, r12, r12
	mulld	r30, r23, r23
	mulhdu	r31, r23, r23
	ld	r8,   0(rp)
	ld	r9,   8(rp)
	adde	r8, r8, r8
	adde	r9, r9, r9
	ld	r10, 16(rp)
	ld	r11, 24(rp)
	adde	r10, r10, r10
	adde	r11, r11, r11
	ld	r6,  32(rp)
	ld	r7,  40(rp)
	adde	r6, r6, r6
	adde	r7, r7, r7
	ld	r12, 48(rp)
	ld	r23, 56(rp)
	adde	r12, r12, r12
	adde	r23, r23, r23
	addze	r31, r31
	addc	r8, r8, climb
	std	r8,   0(rp)
	adde	r9, r9, r24
	std	r9,   8(rp)
	adde	r10, r10, r25
	std	r10, 16(rp)
	adde	r11, r11, r26
	std	r11, 24(rp)
	adde	r6, r6, r27
	std	r6,  32(rp)
	adde	r7, r7, r28
	std	r7,  40(rp)
	adde	r12, r12, r29
	std	r12, 48(rp)
	adde	r23, r23, r30
	std	r23, 56(rp)
	mr	climb, r31
	addi	rp, rp, 64
	bdnz	L(top)

L(end):	addze	climb, climb
	std	climb,  0(rp)

	ld	r31,  -8(r1)
	ld	r30, -16(r1)
	ld	r29, -24(r1)
	ld	r28, -32(r1)
	ld	r27, -40(r1)
	ld	r26, -48(r1)
	ld	r25, -56(r1)
	ld	r24, -64(r1)
	ld	r23, -72(r1)
	ld	r22, -80(r1)
	ld	r21, -88(r1)
	blr
EPILOGUE()