xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/pa64/mul_1.asm (revision d25ffa98a4bfca1fe272f3c182496ec9934faac7)
1dnl  HP-PA 2.0 64-bit mpn_mul_1 -- Multiply a limb vector with a limb and store
2dnl  the result in a second limb vector.
3
4dnl  Copyright 1998, 1999, 2000, 2002, 2003 Free Software Foundation, Inc.
5
6dnl  This file is part of the GNU MP Library.
7
8dnl  The GNU MP Library is free software; you can redistribute it and/or modify
9dnl  it under the terms of the GNU Lesser General Public License as published
10dnl  by the Free Software Foundation; either version 3 of the License, or (at
11dnl  your option) any later version.
12
13dnl  The GNU MP Library is distributed in the hope that it will be useful, but
14dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
16dnl  License for more details.
17
18dnl  You should have received a copy of the GNU Lesser General Public License
19dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
20
21include(`../config.m4')
22
23C		    cycles/limb
24C 8000,8200:		6.5
25C 8500,8600,8700:	5.625
26
27C  The feed-in and wind-down code has not yet been scheduled.  Many cycles
28C  could be saved there per call.
29
30C  DESCRIPTION:
31C  The main loop "BIG" is 4-way unrolled, mainly to allow
32C  effective use of ADD,DC.  Delays in moving data via the cache from the FP
33C  registers to the IU registers, have demanded a deep software pipeline, and
34C  a lot of stack slots for partial products in flight.
35C
36C  CODE STRUCTURE:
37C  save-some-registers
38C  do 0, 1, 2, or 3 limbs
39C  if done, restore-some-regs and return
40C  save-many-regs
41C  do 4, 8, ... limbs
42C  restore-all-regs
43
44C  STACK LAYOUT:
45C  HP-PA stack grows upwards.  We could allocate 8 fewer slots by using the
46C  slots marked FREE, as well as some slots in the caller's "frame marker".
47C
48C -00 <- r30
49C -08  FREE
50C -10  tmp
51C -18  tmp
52C -20  tmp
53C -28  tmp
54C -30  tmp
55C -38  tmp
56C -40  tmp
57C -48  tmp
58C -50  tmp
59C -58  tmp
60C -60  tmp
61C -68  tmp
62C -70  tmp
63C -78  tmp
64C -80  tmp
65C -88  tmp
66C -90  FREE
67C -98  FREE
68C -a0  FREE
69C -a8  FREE
70C -b0  r13
71C -b8  r12
72C -c0  r11
73C -c8  r10
74C -d0  r9
75C -d8  r8
76C -e0  r7
77C -e8  r6
78C -f0  r5
79C -f8  r4
80C -100 r3
81C  Previous frame:
82C  [unused area]
83C -38/-138 vlimb home slot.  For 2.0N, the vlimb arg will arrive here.
84
85
86include(`../config.m4')
87
88C INPUT PARAMETERS:
89define(`rp',`%r26')	C destination pointer, product limbs are stored here
90define(`up',`%r25')	C source pointer, multiplicand limbs are loaded from here
91define(`n',`%r24')	C limb count
92define(`vlimb',`%r23')	C multiplier limb
93
94define(`climb',`%r23')	C carry limb, shares %r23 with vlimb
95
96ifdef(`HAVE_ABI_2_0w',
97`	.level	2.0w
98',`	.level	2.0
99')
100PROLOGUE(mpn_mul_1)
C mpn_mul_1 -- multiply the n limbs at up by vlimb, store the product limbs
C at rp, and hand back the final carry limb, which is accumulated in climb.
C The n mod 4 leading limbs are handled first by straight-line code, then
C the remaining limbs are handled four at a time by the BIG code.
C Each 64x64 limb product is built from four 32x32 xmpyu products that are
C moved from the FP registers to the integer registers via stack slots.
101
102ifdef(`HAVE_ABI_2_0w',
103`	std		vlimb, -0x38(%r30)	C store vlimb into "home" slot
104')
105	std,ma		%r3, 0x100(%r30)	C save r3, then advance r30 by 0x100
106	std		%r4, -0xf8(%r30)	C save r4
107	std		%r5, -0xf0(%r30)	C save r5
108	ldo		0(%r0), climb		C clear climb
109	fldd		-0x138(%r30), %fr8	C put vlimb in fp register
110
C Register names for the n mod 4 feed-in code.  Several of them are
C redefined with other registers at BIG.
111define(`p032a1',`%r1')	C mid product, vlimb low half times limb high half
112define(`p032a2',`%r19')	C mid product, vlimb high half times limb low half
113
114define(`m032',`%r20')	C sum of the two mid products
115define(`m096',`%r21')	C carry out of the mid product sum
116
117define(`p000a',`%r22')	C low product
118define(`p064a',`%r29')	C high product
119
120define(`s000',`%r31')	C result limb being accumulated
121
122define(`ma000',`%r4')	C low half of m032 moved to the upper half
123define(`ma064',`%r20')	C upper half of m032, with m096 deposited above it
124
125C define(`r000',`%r3')	C	FIXME don't save r3 for n < 4.
126
127	extrd,u		n, 63, 2, %r5		C r5 = low two bits of n
128	cmpb,=		%r5, %r0, L(BIG)	C nothing to feed in if they are zero
129	nop
130
C First leading limb: form its four partial products and park them in
C stack slots.
131	fldd		0(up), %fr4
132	ldo		8(up), up
133	xmpyu		%fr8R, %fr4L, %fr22
134	xmpyu		%fr8L, %fr4R, %fr23
135	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
136	xmpyu		%fr8R, %fr4R, %fr24
137	xmpyu		%fr8L, %fr4L, %fr25
138	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
139	fstd		%fr24, -0x80(%r30)	C low product to  -0x80..-0x79
140	addib,<>	-1, %r5, L(two_or_more)	C decrement r5, branch if limbs remain
141	fstd		%fr25, -0x68(%r30)	C high product to -0x68..-0x61
142LDEF(one)
C Exactly one leading limb: fetch its partial products and finish at
C 0_one_out.
143	ldd		-0x78(%r30), p032a1
144	ldd		-0x70(%r30), p032a2
145	ldd		-0x80(%r30), p000a
146	b		L(0_one_out)
147	ldd		-0x68(%r30), p064a
148
149LDEF(two_or_more)
C Second leading limb: each stack slot is read back for the first limb
C just before it is overwritten with the product of the second limb.
150	fldd		0(up), %fr4
151	ldo		8(up), up
152	xmpyu		%fr8R, %fr4L, %fr22
153	xmpyu		%fr8L, %fr4R, %fr23
154	ldd		-0x78(%r30), p032a1
155	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
156	xmpyu		%fr8R, %fr4R, %fr24
157	xmpyu		%fr8L, %fr4L, %fr25
158	ldd		-0x70(%r30), p032a2
159	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
160	ldd		-0x80(%r30), p000a
161	fstd		%fr24, -0x80(%r30)	C low product to  -0x80..-0x79
162	ldd		-0x68(%r30), p064a
163	addib,<>	-1, %r5, L(three_or_more)	C decrement r5, branch if a limb remains
164	fstd		%fr25, -0x68(%r30)	C high product to -0x68..-0x61
165LDEF(two)
C Exactly two leading limbs: combine the mid products of the first limb
C and continue at 0_two_out.
166	add		p032a1, p032a2, m032
167	add,dc		%r0, %r0, m096
168	depd,z		m032, 31, 32, ma000
169	extrd,u		m032, 31, 32, ma064
170	b		L(0_two_out)
171	depd		m096, 31, 32, ma064
172
173LDEF(three_or_more)
C Reached only when the two-bit count in r5 started at 3.  Load the third
C limb and combine the mid products of the first limb.
174	fldd		0(up), %fr4
175	add		p032a1, p032a2, m032
176	add,dc		%r0, %r0, m096
177	depd,z		m032, 31, 32, ma000
178	extrd,u		m032, 31, 32, ma064
179C	addib,=		-1, %r5, L(0_out)
180	depd		m096, 31, 32, ma064
181LDEF(loop0)
C The loop body below is commented out, so execution falls straight
C through to 0_out.
182C	xmpyu		%fr8R, %fr4L, %fr22
183C	xmpyu		%fr8L, %fr4R, %fr23
184C	ldd		-0x78(%r30), p032a1
185C	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
186C
187C	xmpyu		%fr8R, %fr4R, %fr24
188C	xmpyu		%fr8L, %fr4L, %fr25
189C	ldd		-0x70(%r30), p032a2
190C	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
191C
192C	ldo		8(rp), rp
193C	add		climb, p000a, s000
194C	ldd		-0x80(%r30), p000a
195C	fstd		%fr24, -0x80(%r30)	C low product to  -0x80..-0x79
196C
197C	add,dc		p064a, %r0, climb
198C	ldo		8(up), up
199C	ldd		-0x68(%r30), p064a
200C	fstd		%fr25, -0x68(%r30)	C high product to -0x68..-0x61
201C
202C	add		ma000, s000, s000
203C	add,dc		ma064, climb, climb
204C	fldd		0(up), %fr4
205C
206C	std		s000, -8(rp)
207C
208C	add		p032a1, p032a2, m032
209C	add,dc		%r0, %r0, m096
210C
211C	depd,z		m032, 31, 32, ma000
212C	extrd,u		m032, 31, 32, ma064
213C	addib,<>	-1, %r5, L(loop0)
214C	depd		m096, 31, 32, ma064
215LDEF(0_out)
C Multiply the limb loaded at three_or_more, and meanwhile accumulate and
C store the result limb for the oldest limb still in flight.
216	ldo		8(up), up
217	xmpyu		%fr8R, %fr4L, %fr22
218	xmpyu		%fr8L, %fr4R, %fr23
219	ldd		-0x78(%r30), p032a1
220	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
221	xmpyu		%fr8R, %fr4R, %fr24
222	xmpyu		%fr8L, %fr4L, %fr25
223	ldd		-0x70(%r30), p032a2
224	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
225	ldo		8(rp), rp
226	add		climb, p000a, s000
227	ldd		-0x80(%r30), p000a
228	fstd		%fr24, -0x80(%r30)	C low product to  -0x80..-0x79
229	add,dc		p064a, %r0, climb
230	ldd		-0x68(%r30), p064a
231	fstd		%fr25, -0x68(%r30)	C high product to -0x68..-0x61
232	add		ma000, s000, s000
233	add,dc		ma064, climb, climb
234	std		s000, -8(rp)
235	add		p032a1, p032a2, m032
236	add,dc		%r0, %r0, m096
237	depd,z		m032, 31, 32, ma000
238	extrd,u		m032, 31, 32, ma064
239	depd		m096, 31, 32, ma064
240LDEF(0_two_out)
C Fetch the partial products of the last leading limb from the stack,
C and accumulate and store the result limb before it.
241	ldd		-0x78(%r30), p032a1
242	ldd		-0x70(%r30), p032a2
243	ldo		8(rp), rp
244	add		climb, p000a, s000
245	ldd		-0x80(%r30), p000a
246	add,dc		p064a, %r0, climb
247	ldd		-0x68(%r30), p064a
248	add		ma000, s000, s000
249	add,dc		ma064, climb, climb
250	std		s000, -8(rp)
251LDEF(0_one_out)
C Last leading limb: combine its mid products, add low product and carry,
C store the result limb and leave the new carry in climb.
252	add		p032a1, p032a2, m032
253	add,dc		%r0, %r0, m096
254	depd,z		m032, 31, 32, ma000
255	extrd,u		m032, 31, 32, ma064
256	depd		m096, 31, 32, ma064
257
258	add		climb, p000a, s000
259	add,dc		p064a, %r0, climb
260	add		ma000, s000, s000
261	add,dc		ma064, climb, climb
262	std		s000, 0(rp)
263
264	cmpib,>=	4, n, L(done)		C branch to done when 4 >= n
265	ldo		8(rp), rp
266
267C 4-way unrolled code.
268
269LDEF(BIG)
270
C Register names for the 4-way code.  The letters a, b, c, d name the four
C limbs of a group.  Names listed with the same register share it.
271define(`p032a1',`%r1')	C mid products of limb a
272define(`p032a2',`%r19')	C
273define(`p096b1',`%r20')	C mid products of limb b
274define(`p096b2',`%r21')	C
275define(`p160c1',`%r22')	C mid products of limb c
276define(`p160c2',`%r29')	C
277define(`p224d1',`%r31')	C mid products of limb d
278define(`p224d2',`%r3')	C
279			C
280define(`m032',`%r4')	C mid product sum of limb a
281define(`m096',`%r5')	C mid product sum of limb b, plus carry
282define(`m160',`%r6')	C mid product sum of limb c, plus carry
283define(`m224',`%r7')	C mid product sum of limb d, plus carry
284define(`m288',`%r8')	C carry out of the mid product sums
285			C
286define(`p000a',`%r1')	C low product of limb a
287define(`p064a',`%r19')	C high product of limb a
288define(`p064b',`%r20')	C low product of limb b
289define(`p128b',`%r21')	C high product of limb b
290define(`p128c',`%r22')	C low product of limb c
291define(`p192c',`%r29')	C high product of limb c
292define(`p192d',`%r31')	C low product of limb d
293define(`p256d',`%r3')	C high product of limb d
294			C
295define(`s000',`%r10')	C result limb stored at 0(rp)
296define(`s064',`%r11')	C result limb stored at 8(rp)
297define(`s128',`%r12')	C result limb stored at 16(rp)
298define(`s192',`%r13')	C result limb stored at 24(rp)
299			C
300define(`ma000',`%r9')	C low half of m032 moved to the upper half
301define(`ma064',`%r4')	C upper half of m032, low half of m096 above it
302define(`ma128',`%r5')	C upper half of m096, low half of m160 above it
303define(`ma192',`%r6')	C upper half of m160, low half of m224 above it
304define(`ma256',`%r7')	C upper half of m224, low half of m288 above it
305
C Save r6 through r13, which are used by the 4-way code.
306	std		%r6, -0xe8(%r30)
307	std		%r7, -0xe0(%r30)
308	std		%r8, -0xd8(%r30)
309	std		%r9, -0xd0(%r30)
310	std		%r10, -0xc8(%r30)
311	std		%r11, -0xc0(%r30)
312	std		%r12, -0xb8(%r30)
313	std		%r13, -0xb0(%r30)
314
315ifdef(`HAVE_ABI_2_0w',
316`	extrd,u		n, 61, 62, n		C right shift 2
317',`	extrd,u		n, 61, 30, n		C right shift 2, zero extend
318')
319
320LDEF(4_or_more)
C From here on n counts groups of four limbs.  Load the first group and
C start its sixteen partial products.
321	fldd		0(up), %fr4
322	fldd		8(up), %fr5
323	fldd		16(up), %fr6
324	fldd		24(up), %fr7
325	xmpyu		%fr8R, %fr4L, %fr22
326	xmpyu		%fr8L, %fr4R, %fr23
327	xmpyu		%fr8R, %fr5L, %fr24
328	xmpyu		%fr8L, %fr5R, %fr25
329	xmpyu		%fr8R, %fr6L, %fr26
330	xmpyu		%fr8L, %fr6R, %fr27
331	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
332	xmpyu		%fr8R, %fr7L, %fr28
333	xmpyu		%fr8L, %fr7R, %fr29
334	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
335	xmpyu		%fr8R, %fr4R, %fr30
336	xmpyu		%fr8L, %fr4L, %fr31
337	fstd		%fr24, -0x38(%r30)	C mid product to  -0x38..-0x31
338	xmpyu		%fr8R, %fr5R, %fr22
339	xmpyu		%fr8L, %fr5L, %fr23
340	fstd		%fr25, -0x30(%r30)	C mid product to  -0x30..-0x29
341	xmpyu		%fr8R, %fr6R, %fr24
342	xmpyu		%fr8L, %fr6L, %fr25
343	fstd		%fr26, -0x58(%r30)	C mid product to  -0x58..-0x51
344	xmpyu		%fr8R, %fr7R, %fr26
345	fstd		%fr27, -0x50(%r30)	C mid product to  -0x50..-0x49
346	addib,<>	-1, n, L(8_or_more)	C decrement n, branch if groups remain
347	xmpyu		%fr8L, %fr7L, %fr27
C Only one group: park the remaining products, fetch the mid products
C and finish at end1.
348	fstd		%fr28, -0x18(%r30)	C mid product to  -0x18..-0x11
349	fstd		%fr29, -0x10(%r30)	C mid product to  -0x10..-0x09
350	fstd		%fr30, -0x80(%r30)	C low product to  -0x80..-0x79
351	fstd		%fr31, -0x68(%r30)	C high product to -0x68..-0x61
352	fstd		%fr22, -0x40(%r30)	C low product to  -0x40..-0x39
353	fstd		%fr23, -0x28(%r30)	C high product to -0x28..-0x21
354	fstd		%fr24, -0x60(%r30)	C low product to  -0x60..-0x59
355	fstd		%fr25, -0x48(%r30)	C high product to -0x48..-0x41
356	fstd		%fr26, -0x20(%r30)	C low product to  -0x20..-0x19
357	fstd		%fr27, -0x88(%r30)	C high product to -0x88..-0x81
358	ldd		-0x78(%r30), p032a1
359	ldd		-0x70(%r30), p032a2
360	ldd		-0x38(%r30), p096b1
361	ldd		-0x30(%r30), p096b2
362	ldd		-0x58(%r30), p160c1
363	ldd		-0x50(%r30), p160c2
364	ldd		-0x18(%r30), p224d1
365	ldd		-0x10(%r30), p224d2
366	b		L(end1)
367	nop
368
369LDEF(8_or_more)
C At least two groups: park the rest of the first group, load the second
C group and start its products.  Each mid product slot of the first group
C is read back before the second group overwrites it.
370	fstd		%fr28, -0x18(%r30)	C mid product to  -0x18..-0x11
371	fstd		%fr29, -0x10(%r30)	C mid product to  -0x10..-0x09
372	ldo		32(up), up
373	fstd		%fr30, -0x80(%r30)	C low product to  -0x80..-0x79
374	fstd		%fr31, -0x68(%r30)	C high product to -0x68..-0x61
375	fstd		%fr22, -0x40(%r30)	C low product to  -0x40..-0x39
376	fstd		%fr23, -0x28(%r30)	C high product to -0x28..-0x21
377	fstd		%fr24, -0x60(%r30)	C low product to  -0x60..-0x59
378	fstd		%fr25, -0x48(%r30)	C high product to -0x48..-0x41
379	fstd		%fr26, -0x20(%r30)	C low product to  -0x20..-0x19
380	fstd		%fr27, -0x88(%r30)	C high product to -0x88..-0x81
381	fldd		0(up), %fr4
382	fldd		8(up), %fr5
383	fldd		16(up), %fr6
384	fldd		24(up), %fr7
385	xmpyu		%fr8R, %fr4L, %fr22
386	ldd		-0x78(%r30), p032a1
387	xmpyu		%fr8L, %fr4R, %fr23
388	xmpyu		%fr8R, %fr5L, %fr24
389	ldd		-0x70(%r30), p032a2
390	xmpyu		%fr8L, %fr5R, %fr25
391	xmpyu		%fr8R, %fr6L, %fr26
392	ldd		-0x38(%r30), p096b1
393	xmpyu		%fr8L, %fr6R, %fr27
394	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
395	xmpyu		%fr8R, %fr7L, %fr28
396	ldd		-0x30(%r30), p096b2
397	xmpyu		%fr8L, %fr7R, %fr29
398	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
399	xmpyu		%fr8R, %fr4R, %fr30
400	ldd		-0x58(%r30), p160c1
401	xmpyu		%fr8L, %fr4L, %fr31
402	fstd		%fr24, -0x38(%r30)	C mid product to  -0x38..-0x31
403	xmpyu		%fr8R, %fr5R, %fr22
404	ldd		-0x50(%r30), p160c2
405	xmpyu		%fr8L, %fr5L, %fr23
406	fstd		%fr25, -0x30(%r30)	C mid product to  -0x30..-0x29
407	xmpyu		%fr8R, %fr6R, %fr24
408	ldd		-0x18(%r30), p224d1
409	xmpyu		%fr8L, %fr6L, %fr25
410	fstd		%fr26, -0x58(%r30)	C mid product to  -0x58..-0x51
411	xmpyu		%fr8R, %fr7R, %fr26
412	ldd		-0x10(%r30), p224d2
413	fstd		%fr27, -0x50(%r30)	C mid product to  -0x50..-0x49
414	addib,=		-1, n, L(end2)		C decrement n, skip the loop if no groups remain
415	xmpyu		%fr8L, %fr7L, %fr27
416LDEF(loop)
C Main loop.  Each pass sums and stores the four result limbs of the
C previous group, parks the products of the current group, and loads and
C multiplies the next group.
417	add		p032a1, p032a2, m032
418	ldd		-0x80(%r30), p000a
419	add,dc		p096b1, p096b2, m096
420	fstd		%fr28, -0x18(%r30)	C mid product to  -0x18..-0x11
421
422	add,dc		p160c1, p160c2, m160
423	ldd		-0x68(%r30), p064a
424	add,dc		p224d1, p224d2, m224
425	fstd		%fr29, -0x10(%r30)	C mid product to  -0x10..-0x09
426
427	add,dc		%r0, %r0, m288
428	ldd		-0x40(%r30), p064b
429	ldo		32(up), up
430	fstd		%fr30, -0x80(%r30)	C low product to  -0x80..-0x79
431
432	depd,z		m032, 31, 32, ma000
433	ldd		-0x28(%r30), p128b
434	extrd,u		m032, 31, 32, ma064
435	fstd		%fr31, -0x68(%r30)	C high product to -0x68..-0x61
436
437	depd		m096, 31, 32, ma064
438	ldd		-0x60(%r30), p128c
439	extrd,u		m096, 31, 32, ma128
440	fstd		%fr22, -0x40(%r30)	C low product to  -0x40..-0x39
441
442	depd		m160, 31, 32, ma128
443	ldd		-0x48(%r30), p192c
444	extrd,u		m160, 31, 32, ma192
445	fstd		%fr23, -0x28(%r30)	C high product to -0x28..-0x21
446
447	depd		m224, 31, 32, ma192
448	ldd		-0x20(%r30), p192d
449	extrd,u		m224, 31, 32, ma256
450	fstd		%fr24, -0x60(%r30)	C low product to  -0x60..-0x59
451
452	depd		m288, 31, 32, ma256
453	ldd		-0x88(%r30), p256d
454	add		climb, p000a, s000
455	fstd		%fr25, -0x48(%r30)	C high product to -0x48..-0x41
456
457	add,dc		p064a, p064b, s064
458	add,dc		p128b, p128c, s128
459	fstd		%fr26, -0x20(%r30)	C low product to  -0x20..-0x19
460
461	add,dc		p192c, p192d, s192
462	add,dc		p256d, %r0, climb
463	fstd		%fr27, -0x88(%r30)	C high product to -0x88..-0x81
464
465	add		ma000, s000, s000	C accum mid 0
466	fldd		0(up), %fr4
467	add,dc		ma064, s064, s064	C accum mid 1
468	std		s000, 0(rp)
469
470	add,dc		ma128, s128, s128	C accum mid 2
471	fldd		8(up), %fr5
472	add,dc		ma192, s192, s192	C accum mid 3
473	std		s064, 8(rp)
474
475	add,dc		ma256, climb, climb
476	fldd		16(up), %fr6
477	std		s128, 16(rp)
478
479	xmpyu		%fr8R, %fr4L, %fr22
480	ldd		-0x78(%r30), p032a1
481	xmpyu		%fr8L, %fr4R, %fr23
482	fldd		24(up), %fr7
483
484	xmpyu		%fr8R, %fr5L, %fr24
485	ldd		-0x70(%r30), p032a2
486	xmpyu		%fr8L, %fr5R, %fr25
487	std		s192, 24(rp)
488
489	xmpyu		%fr8R, %fr6L, %fr26
490	ldd		-0x38(%r30), p096b1
491	xmpyu		%fr8L, %fr6R, %fr27
492	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
493
494	xmpyu		%fr8R, %fr7L, %fr28
495	ldd		-0x30(%r30), p096b2
496	xmpyu		%fr8L, %fr7R, %fr29
497	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
498
499	xmpyu		%fr8R, %fr4R, %fr30
500	ldd		-0x58(%r30), p160c1
501	xmpyu		%fr8L, %fr4L, %fr31
502	fstd		%fr24, -0x38(%r30)	C mid product to  -0x38..-0x31
503
504	xmpyu		%fr8R, %fr5R, %fr22
505	ldd		-0x50(%r30), p160c2
506	xmpyu		%fr8L, %fr5L, %fr23
507	fstd		%fr25, -0x30(%r30)	C mid product to  -0x30..-0x29
508
509	xmpyu		%fr8R, %fr6R, %fr24
510	ldd		-0x18(%r30), p224d1
511	xmpyu		%fr8L, %fr6L, %fr25
512	fstd		%fr26, -0x58(%r30)	C mid product to  -0x58..-0x51
513
514	xmpyu		%fr8R, %fr7R, %fr26
515	ldd		-0x10(%r30), p224d2
516	fstd		%fr27, -0x50(%r30)	C mid product to  -0x50..-0x49
517	xmpyu		%fr8L, %fr7L, %fr27
518
519	addib,<>	-1, n, L(loop)		C decrement n, loop while groups remain
520	ldo		32(rp), rp
521
522LDEF(end2)
C Wind-down, part 1: sum and store the second-to-last group while parking
C the products of the last group, then fetch the mid products of the last
C group.  No further limbs are loaded from up.
523	add		p032a1, p032a2, m032
524	ldd		-0x80(%r30), p000a
525	add,dc		p096b1, p096b2, m096
526	fstd		%fr28, -0x18(%r30)	C mid product to  -0x18..-0x11
527	add,dc		p160c1, p160c2, m160
528	ldd		-0x68(%r30), p064a
529	add,dc		p224d1, p224d2, m224
530	fstd		%fr29, -0x10(%r30)	C mid product to  -0x10..-0x09
531	add,dc		%r0, %r0, m288
532	ldd		-0x40(%r30), p064b
533	fstd		%fr30, -0x80(%r30)	C low product to  -0x80..-0x79
534	depd,z		m032, 31, 32, ma000
535	ldd		-0x28(%r30), p128b
536	extrd,u		m032, 31, 32, ma064
537	fstd		%fr31, -0x68(%r30)	C high product to -0x68..-0x61
538	depd		m096, 31, 32, ma064
539	ldd		-0x60(%r30), p128c
540	extrd,u		m096, 31, 32, ma128
541	fstd		%fr22, -0x40(%r30)	C low product to  -0x40..-0x39
542	depd		m160, 31, 32, ma128
543	ldd		-0x48(%r30), p192c
544	extrd,u		m160, 31, 32, ma192
545	fstd		%fr23, -0x28(%r30)	C high product to -0x28..-0x21
546	depd		m224, 31, 32, ma192
547	ldd		-0x20(%r30), p192d
548	extrd,u		m224, 31, 32, ma256
549	fstd		%fr24, -0x60(%r30)	C low product to  -0x60..-0x59
550	depd		m288, 31, 32, ma256
551	ldd		-0x88(%r30), p256d
552	add		climb, p000a, s000
553	fstd		%fr25, -0x48(%r30)	C high product to -0x48..-0x41
554	add,dc		p064a, p064b, s064
555	add,dc		p128b, p128c, s128
556	fstd		%fr26, -0x20(%r30)	C low product to  -0x20..-0x19
557	add,dc		p192c, p192d, s192
558	add,dc		p256d, %r0, climb
559	fstd		%fr27, -0x88(%r30)	C high product to -0x88..-0x81
560	add		ma000, s000, s000	C accum mid 0
561	add,dc		ma064, s064, s064	C accum mid 1
562	add,dc		ma128, s128, s128	C accum mid 2
563	add,dc		ma192, s192, s192	C accum mid 3
564	add,dc		ma256, climb, climb
565	std		s000, 0(rp)
566	std		s064, 8(rp)
567	ldd		-0x78(%r30), p032a1
568	std		s128, 16(rp)
569	ldd		-0x70(%r30), p032a2
570	std		s192, 24(rp)
571	ldd		-0x38(%r30), p096b1
572	ldd		-0x30(%r30), p096b2
573	ldd		-0x58(%r30), p160c1
574	ldd		-0x50(%r30), p160c2
575	ldd		-0x18(%r30), p224d1
576	ldd		-0x10(%r30), p224d2
577	ldo		32(rp), rp
578
579LDEF(end1)
C Wind-down, part 2: sum and store the last group of four limbs, leaving
C the final carry in climb, then restore r6 through r13.
580	add		p032a1, p032a2, m032
581	ldd		-0x80(%r30), p000a
582	add,dc		p096b1, p096b2, m096
583	add,dc		p160c1, p160c2, m160
584	ldd		-0x68(%r30), p064a
585	add,dc		p224d1, p224d2, m224
586	add,dc		%r0, %r0, m288
587	ldd		-0x40(%r30), p064b
588	depd,z		m032, 31, 32, ma000
589	ldd		-0x28(%r30), p128b
590	extrd,u		m032, 31, 32, ma064
591	depd		m096, 31, 32, ma064
592	ldd		-0x60(%r30), p128c
593	extrd,u		m096, 31, 32, ma128
594	depd		m160, 31, 32, ma128
595	ldd		-0x48(%r30), p192c
596	extrd,u		m160, 31, 32, ma192
597	depd		m224, 31, 32, ma192
598	ldd		-0x20(%r30), p192d
599	extrd,u		m224, 31, 32, ma256
600	depd		m288, 31, 32, ma256
601	ldd		-0x88(%r30), p256d
602	add		climb, p000a, s000
603	add,dc		p064a, p064b, s064
604	add,dc		p128b, p128c, s128
605	add,dc		p192c, p192d, s192
606	add,dc		p256d, %r0, climb
607	add		ma000, s000, s000	C accum mid 0
608	add,dc		ma064, s064, s064	C accum mid 1
609	add,dc		ma128, s128, s128	C accum mid 2
610	add,dc		ma192, s192, s192	C accum mid 3
611	add,dc		ma256, climb, climb
612	std		s000, 0(rp)
613	std		s064, 8(rp)
614	std		s128, 16(rp)
615	std		s192, 24(rp)
616
617	ldd		-0xb0(%r30), %r13
618	ldd		-0xb8(%r30), %r12
619	ldd		-0xc0(%r30), %r11
620	ldd		-0xc8(%r30), %r10
621	ldd		-0xd0(%r30), %r9
622	ldd		-0xd8(%r30), %r8
623	ldd		-0xe0(%r30), %r7
624	ldd		-0xe8(%r30), %r6
625LDEF(done)
C Common exit.  For 2.0w climb is copied to %r28.  Otherwise its lower 32
C bits go to %r29 and its upper 32 bits to %r28.  Then r5, r4 and r3 are
C restored, with the r3 load also moving r30 back down by 0x100.
626ifdef(`HAVE_ABI_2_0w',
627`	copy		climb, %r28
628',`	extrd,u		climb, 63, 32, %r29
629	extrd,u		climb, 31, 32, %r28
630')
631	ldd		-0xf0(%r30), %r5
632	ldd		-0xf8(%r30), %r4
633	bve		(%r2)
634	ldd,mb		-0x100(%r30), %r3
635EPILOGUE(mpn_mul_1)
636