dnl  HP-PA 2.0 64-bit mpn_addmul_1 -- Multiply a limb vector with a limb and
dnl  add the result to a second limb vector.

dnl  Copyright 1998, 1999, 2000, 2002, 2003 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C		    cycles/limb
C 8000,8200:		7
C 8500,8600,8700:	6.375

C  The feed-in and wind-down code has not yet been scheduled.  Many cycles
C  could be saved there per call.

C  DESCRIPTION:
C  The main loop "BIG" is 4-way unrolled, mainly to allow effective use of
C  ADD,DC.  Delays in moving data via the cache from the FP registers to the
C  IU registers have demanded a deep software pipeline and a lot of stack
C  slots for partial products in flight.
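C
C  For reference, a minimal C sketch of the per-limb work (illustrative only,
C  not part of the build; it assumes 64-bit limbs and uses made-up names, and
C  it ignores the scheduling this file is really about).  Each 64x64 product
C  is built from four 32x32 xmpyu partial products; the two mid products are
C  summed and folded in with carry chains, which is what the ADD,DC sequences
C  below implement:
C
C    #include <stdint.h>
C
C    uint64_t
C    ref_addmul_1 (uint64_t *rp, const uint64_t *up, long n, uint64_t v)
C    {
C      uint64_t climb = 0;
C      for (long i = 0; i < n; i++)
C        {
C          uint64_t ul = up[i] & 0xffffffff, uh = up[i] >> 32;
C          uint64_t vl = v & 0xffffffff,     vh = v >> 32;
C          uint64_t lo = ul * vl, hi = uh * vh;           /* low/high products */
C          uint64_t m = ul * vh + uh * vl;                /* two mid products */
C          hi += (m < ul * vh) ? (uint64_t) 1 << 32 : 0;  /* carry of mid sum */
C          uint64_t s = lo + (m << 32);                   /* fold mid into low */
C          hi += (m >> 32) + (s < lo);
C          s += climb;  hi += (s < climb);                /* add carry limb */
C          s += rp[i];  hi += (s < rp[i]);                /* add rp limb */
C          rp[i] = s;
C          climb = hi;
C        }
C      return climb;
C    }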
C
C  CODE STRUCTURE:
C  save-some-registers
C  do 0, 1, 2, or 3 limbs
C  if done, restore-some-regs and return
C  save-many-regs
C  do 4, 8, ... limbs
C  restore-all-regs
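C
C  As a sketch (C-like pseudocode, illustrative only), the structure above
C  amounts to:
C
C    climb = 0;
C    if (n mod 4 != 0)
C      {
C        do the n mod 4 leading limbs with straight-line code;
C        if (n < 4)
C          return climb;                   /* everything already handled */
C      }
C    save the remaining callee-saved registers;
C    do floor(n/4) groups of 4 limbs in the unrolled loop;  /* L(BIG)/L(loop) */
C    restore the registers and return climb;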

C  STACK LAYOUT:
C  HP-PA stack grows upwards.  We could allocate 8 fewer slots by using the
C  slots marked FREE, as well as some slots in the caller's "frame marker".
C
C -00 <- r30
C -08  FREE
C -10  tmp
C -18  tmp
C -20  tmp
C -28  tmp
C -30  tmp
C -38  tmp
C -40  tmp
C -48  tmp
C -50  tmp
C -58  tmp
C -60  tmp
C -68  tmp
C -70  tmp
C -78  tmp
C -80  tmp
C -88  tmp
C -90  FREE
C -98  FREE
C -a0  FREE
C -a8  FREE
C -b0  r13
C -b8  r12
C -c0  r11
C -c8  r10
C -d0  r9
C -d8  r8
C -e0  r7
C -e8  r6
C -f0  r5
C -f8  r4
C -100 r3
C  Previous frame:
C  [unused area]
C -38/-138 vlimb home slot.  For 2.0N, the vlimb arg will arrive here.
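C
C  (Worked out: the 0x100-byte frame pushed by "std,ma %r3, 0x100(%r30)"
C  below moves %r30 up by 0x100, so the caller-frame slot at -0x38 is then
C  addressed as -0x38 - 0x100 = -0x138, which is why vlimb is reloaded with
C  "fldd -0x138(%r30)".)
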
C INPUT PARAMETERS:
define(`rp',`%r26')	C
define(`up',`%r25')	C
define(`n',`%r24')	C
define(`vlimb',`%r23')	C

define(`climb',`%r23')	C

ifdef(`HAVE_ABI_2_0w',
`	.level	2.0w
',`	.level	2.0
')
PROLOGUE(mpn_addmul_1)

ifdef(`HAVE_ABI_2_0w',
`	std		vlimb, -0x38(%r30)	C store vlimb into "home" slot
')
	std,ma		%r3, 0x100(%r30)
	std		%r4, -0xf8(%r30)
	std		%r5, -0xf0(%r30)
	ldo		0(%r0), climb		C clear climb
	fldd		-0x138(%r30), %fr8	C put vlimb in fp register

define(`p032a1',`%r1')	C
define(`p032a2',`%r19')	C

define(`m032',`%r20')	C
define(`m096',`%r21')	C

define(`p000a',`%r22')	C
define(`p064a',`%r29')	C

define(`s000',`%r31')	C

define(`ma000',`%r4')	C
define(`ma064',`%r20')	C

define(`r000',`%r3')	C

	extrd,u		n, 63, 2, %r5
	cmpb,=		%r5, %r0, L(BIG)
	nop

	fldd		0(up), %fr4
	ldo		8(up), up
	xmpyu		%fr8R, %fr4L, %fr22
	xmpyu		%fr8L, %fr4R, %fr23
	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
	xmpyu		%fr8R, %fr4R, %fr24
	xmpyu		%fr8L, %fr4L, %fr25
	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
	fstd		%fr24, -0x80(%r30)	C low product to  -0x80..-0x79
	addib,<>	-1, %r5, L(two_or_more)
	fstd		%fr25, -0x68(%r30)	C high product to -0x68..-0x61
LDEF(one)
	ldd		-0x78(%r30), p032a1
	ldd		-0x70(%r30), p032a2
	ldd		-0x80(%r30), p000a
	b		L(0_one_out)
	ldd		-0x68(%r30), p064a

LDEF(two_or_more)
	fldd		0(up), %fr4
	ldo		8(up), up
	xmpyu		%fr8R, %fr4L, %fr22
	xmpyu		%fr8L, %fr4R, %fr23
	ldd		-0x78(%r30), p032a1
	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
	xmpyu		%fr8R, %fr4R, %fr24
	xmpyu		%fr8L, %fr4L, %fr25
	ldd		-0x70(%r30), p032a2
	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
	ldd		-0x80(%r30), p000a
	fstd		%fr24, -0x80(%r30)	C low product to  -0x80..-0x79
	ldd		-0x68(%r30), p064a
	addib,<>	-1, %r5, L(three_or_more)
	fstd		%fr25, -0x68(%r30)	C high product to -0x68..-0x61
LDEF(two)
	add		p032a1, p032a2, m032
	add,dc		%r0, %r0, m096
	depd,z		m032, 31, 32, ma000
	extrd,u		m032, 31, 32, ma064
	ldd		0(rp), r000
	b		L(0_two_out)
	depd		m096, 31, 32, ma064

LDEF(three_or_more)
	fldd		0(up), %fr4
	add		p032a1, p032a2, m032
	add,dc		%r0, %r0, m096
	depd,z		m032, 31, 32, ma000
	extrd,u		m032, 31, 32, ma064
	ldd		0(rp), r000
C	addib,=		-1, %r5, L(0_out)
	depd		m096, 31, 32, ma064
LDEF(loop0)
C	xmpyu		%fr8R, %fr4L, %fr22
C	xmpyu		%fr8L, %fr4R, %fr23
C	ldd		-0x78(%r30), p032a1
C	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
C
C	xmpyu		%fr8R, %fr4R, %fr24
C	xmpyu		%fr8L, %fr4L, %fr25
C	ldd		-0x70(%r30), p032a2
C	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
C
C	ldo		8(rp), rp
C	add		climb, p000a, s000
C	ldd		-0x80(%r30), p000a
C	fstd		%fr24, -0x80(%r30)	C low product to  -0x80..-0x79
C
C	add,dc		p064a, %r0, climb
C	ldo		8(up), up
C	ldd		-0x68(%r30), p064a
C	fstd		%fr25, -0x68(%r30)	C high product to -0x68..-0x61
C
C	add		ma000, s000, s000
C	add,dc		ma064, climb, climb
C	fldd		0(up), %fr4
C
C	add		r000, s000, s000
C	add,dc		%r0, climb, climb
C	std		s000, -8(rp)
C
C	add		p032a1, p032a2, m032
C	add,dc		%r0, %r0, m096
C
C	depd,z		m032, 31, 32, ma000
C	extrd,u		m032, 31, 32, ma064
C	ldd		0(rp), r000
C	addib,<>	-1, %r5, L(loop0)
C	depd		m096, 31, 32, ma064
LDEF(0_out)
	ldo		8(up), up
	xmpyu		%fr8R, %fr4L, %fr22
	xmpyu		%fr8L, %fr4R, %fr23
	ldd		-0x78(%r30), p032a1
	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
	xmpyu		%fr8R, %fr4R, %fr24
	xmpyu		%fr8L, %fr4L, %fr25
	ldd		-0x70(%r30), p032a2
	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
	ldo		8(rp), rp
	add		climb, p000a, s000
	ldd		-0x80(%r30), p000a
	fstd		%fr24, -0x80(%r30)	C low product to  -0x80..-0x79
	add,dc		p064a, %r0, climb
	ldd		-0x68(%r30), p064a
	fstd		%fr25, -0x68(%r30)	C high product to -0x68..-0x61
	add		ma000, s000, s000
	add,dc		ma064, climb, climb
	add		r000, s000, s000
	add,dc		%r0, climb, climb
	std		s000, -8(rp)
	add		p032a1, p032a2, m032
	add,dc		%r0, %r0, m096
	depd,z		m032, 31, 32, ma000
	extrd,u		m032, 31, 32, ma064
	ldd		0(rp), r000
	depd		m096, 31, 32, ma064
LDEF(0_two_out)
	ldd		-0x78(%r30), p032a1
	ldd		-0x70(%r30), p032a2
	ldo		8(rp), rp
	add		climb, p000a, s000
	ldd		-0x80(%r30), p000a
	add,dc		p064a, %r0, climb
	ldd		-0x68(%r30), p064a
	add		ma000, s000, s000
	add,dc		ma064, climb, climb
	add		r000, s000, s000
	add,dc		%r0, climb, climb
	std		s000, -8(rp)
LDEF(0_one_out)
	add		p032a1, p032a2, m032
	add,dc		%r0, %r0, m096
	depd,z		m032, 31, 32, ma000
	extrd,u		m032, 31, 32, ma064
	ldd		0(rp), r000
	depd		m096, 31, 32, ma064

	add		climb, p000a, s000
	add,dc		p064a, %r0, climb
	add		ma000, s000, s000
	add,dc		ma064, climb, climb
	add		r000, s000, s000
	add,dc		%r0, climb, climb
	std		s000, 0(rp)

	cmpib,>=	4, n, L(done)
	ldo		8(rp), rp

C 4-way unrolled code.

LDEF(BIG)

define(`p032a1',`%r1')	C
define(`p032a2',`%r19')	C
define(`p096b1',`%r20')	C
define(`p096b2',`%r21')	C
define(`p160c1',`%r22')	C
define(`p160c2',`%r29')	C
define(`p224d1',`%r31')	C
define(`p224d2',`%r3')	C
			C
define(`m032',`%r4')	C
define(`m096',`%r5')	C
define(`m160',`%r6')	C
define(`m224',`%r7')	C
define(`m288',`%r8')	C
			C
define(`p000a',`%r1')	C
define(`p064a',`%r19')	C
define(`p064b',`%r20')	C
define(`p128b',`%r21')	C
define(`p128c',`%r22')	C
define(`p192c',`%r29')	C
define(`p192d',`%r31')	C
define(`p256d',`%r3')	C
			C
define(`s000',`%r10')	C
define(`s064',`%r11')	C
define(`s128',`%r12')	C
define(`s192',`%r13')	C
			C
define(`ma000',`%r9')	C
define(`ma064',`%r4')	C
define(`ma128',`%r5')	C
define(`ma192',`%r6')	C
define(`ma256',`%r7')	C
			C
define(`r000',`%r1')	C
define(`r064',`%r19')	C
define(`r128',`%r20')	C
define(`r192',`%r21')	C

	std		%r6, -0xe8(%r30)
	std		%r7, -0xe0(%r30)
	std		%r8, -0xd8(%r30)
	std		%r9, -0xd0(%r30)
	std		%r10, -0xc8(%r30)
	std		%r11, -0xc0(%r30)
	std		%r12, -0xb8(%r30)
	std		%r13, -0xb0(%r30)

ifdef(`HAVE_ABI_2_0w',
`	extrd,u		n, 61, 62, n		C right shift 2
',`	extrd,u		n, 61, 30, n		C right shift 2, zero extend
')

LDEF(4_or_more)
	fldd		0(up), %fr4
	fldd		8(up), %fr5
	fldd		16(up), %fr6
	fldd		24(up), %fr7
	xmpyu		%fr8R, %fr4L, %fr22
	xmpyu		%fr8L, %fr4R, %fr23
	xmpyu		%fr8R, %fr5L, %fr24
	xmpyu		%fr8L, %fr5R, %fr25
	xmpyu		%fr8R, %fr6L, %fr26
	xmpyu		%fr8L, %fr6R, %fr27
	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
	xmpyu		%fr8R, %fr7L, %fr28
	xmpyu		%fr8L, %fr7R, %fr29
	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
	xmpyu		%fr8R, %fr4R, %fr30
	xmpyu		%fr8L, %fr4L, %fr31
	fstd		%fr24, -0x38(%r30)	C mid product to  -0x38..-0x31
	xmpyu		%fr8R, %fr5R, %fr22
	xmpyu		%fr8L, %fr5L, %fr23
	fstd		%fr25, -0x30(%r30)	C mid product to  -0x30..-0x29
	xmpyu		%fr8R, %fr6R, %fr24
	xmpyu		%fr8L, %fr6L, %fr25
	fstd		%fr26, -0x58(%r30)	C mid product to  -0x58..-0x51
	xmpyu		%fr8R, %fr7R, %fr26
	fstd		%fr27, -0x50(%r30)	C mid product to  -0x50..-0x49
	addib,<>	-1, n, L(8_or_more)
	xmpyu		%fr8L, %fr7L, %fr27
	fstd		%fr28, -0x18(%r30)	C mid product to  -0x18..-0x11
	fstd		%fr29, -0x10(%r30)	C mid product to  -0x10..-0x09
	fstd		%fr30, -0x80(%r30)	C low product to  -0x80..-0x79
	fstd		%fr31, -0x68(%r30)	C high product to -0x68..-0x61
	fstd		%fr22, -0x40(%r30)	C low product to  -0x40..-0x39
	fstd		%fr23, -0x28(%r30)	C high product to -0x28..-0x21
	fstd		%fr24, -0x60(%r30)	C low product to  -0x60..-0x59
	fstd		%fr25, -0x48(%r30)	C high product to -0x48..-0x41
	fstd		%fr26, -0x20(%r30)	C low product to  -0x20..-0x19
	fstd		%fr27, -0x88(%r30)	C high product to -0x88..-0x81
	ldd		-0x78(%r30), p032a1
	ldd		-0x70(%r30), p032a2
	ldd		-0x38(%r30), p096b1
	ldd		-0x30(%r30), p096b2
	ldd		-0x58(%r30), p160c1
	ldd		-0x50(%r30), p160c2
	ldd		-0x18(%r30), p224d1
	ldd		-0x10(%r30), p224d2
	b		L(end1)
	nop

LDEF(8_or_more)
	fstd		%fr28, -0x18(%r30)	C mid product to  -0x18..-0x11
	fstd		%fr29, -0x10(%r30)	C mid product to  -0x10..-0x09
	ldo		32(up), up
	fstd		%fr30, -0x80(%r30)	C low product to  -0x80..-0x79
	fstd		%fr31, -0x68(%r30)	C high product to -0x68..-0x61
	fstd		%fr22, -0x40(%r30)	C low product to  -0x40..-0x39
	fstd		%fr23, -0x28(%r30)	C high product to -0x28..-0x21
	fstd		%fr24, -0x60(%r30)	C low product to  -0x60..-0x59
	fstd		%fr25, -0x48(%r30)	C high product to -0x48..-0x41
	fstd		%fr26, -0x20(%r30)	C low product to  -0x20..-0x19
	fstd		%fr27, -0x88(%r30)	C high product to -0x88..-0x81
	fldd		0(up), %fr4
	fldd		8(up), %fr5
	fldd		16(up), %fr6
	fldd		24(up), %fr7
	xmpyu		%fr8R, %fr4L, %fr22
	ldd		-0x78(%r30), p032a1
	xmpyu		%fr8L, %fr4R, %fr23
	xmpyu		%fr8R, %fr5L, %fr24
	ldd		-0x70(%r30), p032a2
	xmpyu		%fr8L, %fr5R, %fr25
	xmpyu		%fr8R, %fr6L, %fr26
	ldd		-0x38(%r30), p096b1
	xmpyu		%fr8L, %fr6R, %fr27
	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
	xmpyu		%fr8R, %fr7L, %fr28
	ldd		-0x30(%r30), p096b2
	xmpyu		%fr8L, %fr7R, %fr29
	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
	xmpyu		%fr8R, %fr4R, %fr30
	ldd		-0x58(%r30), p160c1
	xmpyu		%fr8L, %fr4L, %fr31
	fstd		%fr24, -0x38(%r30)	C mid product to  -0x38..-0x31
	xmpyu		%fr8R, %fr5R, %fr22
	ldd		-0x50(%r30), p160c2
	xmpyu		%fr8L, %fr5L, %fr23
	fstd		%fr25, -0x30(%r30)	C mid product to  -0x30..-0x29
	xmpyu		%fr8R, %fr6R, %fr24
	ldd		-0x18(%r30), p224d1
	xmpyu		%fr8L, %fr6L, %fr25
	fstd		%fr26, -0x58(%r30)	C mid product to  -0x58..-0x51
	xmpyu		%fr8R, %fr7R, %fr26
	ldd		-0x10(%r30), p224d2
	fstd		%fr27, -0x50(%r30)	C mid product to  -0x50..-0x49
	addib,=		-1, n, L(end2)
	xmpyu		%fr8L, %fr7L, %fr27
LDEF(loop)
	add		p032a1, p032a2, m032
	ldd		-0x80(%r30), p000a
	add,dc		p096b1, p096b2, m096
	fstd		%fr28, -0x18(%r30)	C mid product to  -0x18..-0x11

	add,dc		p160c1, p160c2, m160
	ldd		-0x68(%r30), p064a
	add,dc		p224d1, p224d2, m224
	fstd		%fr29, -0x10(%r30)	C mid product to  -0x10..-0x09

	add,dc		%r0, %r0, m288
	ldd		-0x40(%r30), p064b
	ldo		32(up), up
	fstd		%fr30, -0x80(%r30)	C low product to  -0x80..-0x79

	depd,z		m032, 31, 32, ma000
	ldd		-0x28(%r30), p128b
	extrd,u		m032, 31, 32, ma064
	fstd		%fr31, -0x68(%r30)	C high product to -0x68..-0x61

	depd		m096, 31, 32, ma064
	ldd		-0x60(%r30), p128c
	extrd,u		m096, 31, 32, ma128
	fstd		%fr22, -0x40(%r30)	C low product to  -0x40..-0x39

	depd		m160, 31, 32, ma128
	ldd		-0x48(%r30), p192c
	extrd,u		m160, 31, 32, ma192
	fstd		%fr23, -0x28(%r30)	C high product to -0x28..-0x21

	depd		m224, 31, 32, ma192
	ldd		-0x20(%r30), p192d
	extrd,u		m224, 31, 32, ma256
	fstd		%fr24, -0x60(%r30)	C low product to  -0x60..-0x59

	depd		m288, 31, 32, ma256
	ldd		-0x88(%r30), p256d
	add		climb, p000a, s000
	fstd		%fr25, -0x48(%r30)	C high product to -0x48..-0x41

	add,dc		p064a, p064b, s064
	ldd		0(rp), r000
	add,dc		p128b, p128c, s128
	fstd		%fr26, -0x20(%r30)	C low product to  -0x20..-0x19

	add,dc		p192c, p192d, s192
	ldd		8(rp), r064
	add,dc		p256d, %r0, climb
	fstd		%fr27, -0x88(%r30)	C high product to -0x88..-0x81

	ldd		16(rp), r128
	add		ma000, s000, s000	C accum mid 0
	ldd		24(rp), r192
	add,dc		ma064, s064, s064	C accum mid 1

	add,dc		ma128, s128, s128	C accum mid 2
	fldd		0(up), %fr4
	add,dc		ma192, s192, s192	C accum mid 3
	fldd		8(up), %fr5

	add,dc		ma256, climb, climb
	fldd		16(up), %fr6
	add		r000, s000, s000	C accum rlimb 0
	fldd		24(up), %fr7

	add,dc		r064, s064, s064	C accum rlimb 1
	add,dc		r128, s128, s128	C accum rlimb 2
	std		s000, 0(rp)

	add,dc		r192, s192, s192	C accum rlimb 3
	add,dc		%r0, climb, climb
	std		s064, 8(rp)

	xmpyu		%fr8R, %fr4L, %fr22
	ldd		-0x78(%r30), p032a1
	xmpyu		%fr8L, %fr4R, %fr23
	std		s128, 16(rp)

	xmpyu		%fr8R, %fr5L, %fr24
	ldd		-0x70(%r30), p032a2
	xmpyu		%fr8L, %fr5R, %fr25
	std		s192, 24(rp)

	xmpyu		%fr8R, %fr6L, %fr26
	ldd		-0x38(%r30), p096b1
	xmpyu		%fr8L, %fr6R, %fr27
	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71

	xmpyu		%fr8R, %fr7L, %fr28
	ldd		-0x30(%r30), p096b2
	xmpyu		%fr8L, %fr7R, %fr29
	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69

	xmpyu		%fr8R, %fr4R, %fr30
	ldd		-0x58(%r30), p160c1
	xmpyu		%fr8L, %fr4L, %fr31
	fstd		%fr24, -0x38(%r30)	C mid product to  -0x38..-0x31

	xmpyu		%fr8R, %fr5R, %fr22
	ldd		-0x50(%r30), p160c2
	xmpyu		%fr8L, %fr5L, %fr23
	fstd		%fr25, -0x30(%r30)	C mid product to  -0x30..-0x29

	xmpyu		%fr8R, %fr6R, %fr24
	ldd		-0x18(%r30), p224d1
	xmpyu		%fr8L, %fr6L, %fr25
	fstd		%fr26, -0x58(%r30)	C mid product to  -0x58..-0x51

	xmpyu		%fr8R, %fr7R, %fr26
	ldd		-0x10(%r30), p224d2
	fstd		%fr27, -0x50(%r30)	C mid product to  -0x50..-0x49
	xmpyu		%fr8L, %fr7L, %fr27

	addib,<>	-1, n, L(loop)
	ldo		32(rp), rp

LDEF(end2)
	add		p032a1, p032a2, m032
	ldd		-0x80(%r30), p000a
	add,dc		p096b1, p096b2, m096
	fstd		%fr28, -0x18(%r30)	C mid product to  -0x18..-0x11
	add,dc		p160c1, p160c2, m160
	ldd		-0x68(%r30), p064a
	add,dc		p224d1, p224d2, m224
	fstd		%fr29, -0x10(%r30)	C mid product to  -0x10..-0x09
	add,dc		%r0, %r0, m288
	ldd		-0x40(%r30), p064b
	fstd		%fr30, -0x80(%r30)	C low product to  -0x80..-0x79
	depd,z		m032, 31, 32, ma000
	ldd		-0x28(%r30), p128b
	extrd,u		m032, 31, 32, ma064
	fstd		%fr31, -0x68(%r30)	C high product to -0x68..-0x61
	depd		m096, 31, 32, ma064
	ldd		-0x60(%r30), p128c
	extrd,u		m096, 31, 32, ma128
	fstd		%fr22, -0x40(%r30)	C low product to  -0x40..-0x39
	depd		m160, 31, 32, ma128
	ldd		-0x48(%r30), p192c
	extrd,u		m160, 31, 32, ma192
	fstd		%fr23, -0x28(%r30)	C high product to -0x28..-0x21
	depd		m224, 31, 32, ma192
	ldd		-0x20(%r30), p192d
	extrd,u		m224, 31, 32, ma256
	fstd		%fr24, -0x60(%r30)	C low product to  -0x60..-0x59
	depd		m288, 31, 32, ma256
	ldd		-0x88(%r30), p256d
	add		climb, p000a, s000
	fstd		%fr25, -0x48(%r30)	C high product to -0x48..-0x41
	add,dc		p064a, p064b, s064
	ldd		0(rp), r000
	add,dc		p128b, p128c, s128
	fstd		%fr26, -0x20(%r30)	C low product to  -0x20..-0x19
	add,dc		p192c, p192d, s192
	ldd		8(rp), r064
	add,dc		p256d, %r0, climb
	fstd		%fr27, -0x88(%r30)	C high product to -0x88..-0x81
	ldd		16(rp), r128
	add		ma000, s000, s000	C accum mid 0
	ldd		24(rp), r192
	add,dc		ma064, s064, s064	C accum mid 1
	add,dc		ma128, s128, s128	C accum mid 2
	add,dc		ma192, s192, s192	C accum mid 3
	add,dc		ma256, climb, climb
	add		r000, s000, s000	C accum rlimb 0
	add,dc		r064, s064, s064	C accum rlimb 1
	add,dc		r128, s128, s128	C accum rlimb 2
	std		s000, 0(rp)
	add,dc		r192, s192, s192	C accum rlimb 3
	add,dc		%r0, climb, climb
	std		s064, 8(rp)
	ldd		-0x78(%r30), p032a1
	std		s128, 16(rp)
	ldd		-0x70(%r30), p032a2
	std		s192, 24(rp)
	ldd		-0x38(%r30), p096b1
	ldd		-0x30(%r30), p096b2
	ldd		-0x58(%r30), p160c1
	ldd		-0x50(%r30), p160c2
	ldd		-0x18(%r30), p224d1
	ldd		-0x10(%r30), p224d2
	ldo		32(rp), rp

LDEF(end1)
	add		p032a1, p032a2, m032
	ldd		-0x80(%r30), p000a
	add,dc		p096b1, p096b2, m096
	add,dc		p160c1, p160c2, m160
	ldd		-0x68(%r30), p064a
	add,dc		p224d1, p224d2, m224
	add,dc		%r0, %r0, m288
	ldd		-0x40(%r30), p064b
	depd,z		m032, 31, 32, ma000
	ldd		-0x28(%r30), p128b
	extrd,u		m032, 31, 32, ma064
	depd		m096, 31, 32, ma064
	ldd		-0x60(%r30), p128c
	extrd,u		m096, 31, 32, ma128
	depd		m160, 31, 32, ma128
	ldd		-0x48(%r30), p192c
	extrd,u		m160, 31, 32, ma192
	depd		m224, 31, 32, ma192
	ldd		-0x20(%r30), p192d
	extrd,u		m224, 31, 32, ma256
	depd		m288, 31, 32, ma256
	ldd		-0x88(%r30), p256d
	add		climb, p000a, s000
	add,dc		p064a, p064b, s064
	ldd		0(rp), r000
	add,dc		p128b, p128c, s128
	add,dc		p192c, p192d, s192
	ldd		8(rp), r064
	add,dc		p256d, %r0, climb
	ldd		16(rp), r128
	add		ma000, s000, s000	C accum mid 0
	ldd		24(rp), r192
	add,dc		ma064, s064, s064	C accum mid 1
	add,dc		ma128, s128, s128	C accum mid 2
	add,dc		ma192, s192, s192	C accum mid 3
	add,dc		ma256, climb, climb
	add		r000, s000, s000	C accum rlimb 0
	add,dc		r064, s064, s064	C accum rlimb 1
	add,dc		r128, s128, s128	C accum rlimb 2
	std		s000, 0(rp)
	add,dc		r192, s192, s192	C accum rlimb 3
	add,dc		%r0, climb, climb
	std		s064, 8(rp)
	std		s128, 16(rp)
	std		s192, 24(rp)

	ldd		-0xb0(%r30), %r13
	ldd		-0xb8(%r30), %r12
	ldd		-0xc0(%r30), %r11
	ldd		-0xc8(%r30), %r10
	ldd		-0xd0(%r30), %r9
	ldd		-0xd8(%r30), %r8
	ldd		-0xe0(%r30), %r7
	ldd		-0xe8(%r30), %r6
LDEF(done)
ifdef(`HAVE_ABI_2_0w',
`	copy		climb, %r28
',`	extrd,u		climb, 63, 32, %r29
	extrd,u		climb, 31, 32, %r28
')
	ldd		-0xf0(%r30), %r5
	ldd		-0xf8(%r30), %r4
	bve		(%r2)
	ldd,mb		-0x100(%r30), %r3
EPILOGUE(mpn_addmul_1)