xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/pa64/addmul_1.asm (revision fc4f42693f9b1c31f39f9cf50af1bf2010325808)
1dnl  HP-PA 2.0 64-bit mpn_addmul_1 -- Multiply a limb vector with a limb and
2dnl  add the result to a second limb vector.
3
4dnl  Copyright 1998-2000, 2002, 2003 Free Software Foundation, Inc.
5
6dnl  This file is part of the GNU MP Library.
7dnl
8dnl  The GNU MP Library is free software; you can redistribute it and/or modify
9dnl  it under the terms of either:
10dnl
11dnl    * the GNU Lesser General Public License as published by the Free
12dnl      Software Foundation; either version 3 of the License, or (at your
13dnl      option) any later version.
14dnl
15dnl  or
16dnl
17dnl    * the GNU General Public License as published by the Free Software
18dnl      Foundation; either version 2 of the License, or (at your option) any
19dnl      later version.
20dnl
21dnl  or both in parallel, as here.
22dnl
23dnl  The GNU MP Library is distributed in the hope that it will be useful, but
24dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
25dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
26dnl  for more details.
27dnl
28dnl  You should have received copies of the GNU General Public License and the
29dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
30dnl  see https://www.gnu.org/licenses/.
31
32include(`../config.m4')
33
34C		    cycles/limb
35C 8000,8200:		7
36C 8500,8600,8700:	6.375
37
38C  The feed-in and wind-down code has not yet been scheduled.  Many cycles
39C  could be saved there per call.
40
41C  DESCRIPTION:
42C  The main loop "BIG" is 4-way unrolled, mainly to allow
43C  effective use of ADD,DC.  Delays in moving data via the cache from the FP
44C  registers to the IU registers, have demanded a deep software pipeline, and
45C  a lot of stack slots for partial products in flight.
46C
47C  CODE STRUCTURE:
48C  save-some-registers
49C  do 0, 1, 2, or 3 limbs
50C  if done, restore-some-regs and return
51C  save-many-regs
52C  do 4, 8, ... limbs
53C  restore-all-regs
54
55C  STACK LAYOUT:
56C  HP-PA stack grows upwards.  We could allocate 8 fewer slots by using the
57C  slots marked FREE, as well as some slots in the caller's "frame marker".
58C
59C -00 <- r30
60C -08  FREE
61C -10  tmp
62C -18  tmp
63C -20  tmp
64C -28  tmp
65C -30  tmp
66C -38  tmp
67C -40  tmp
68C -48  tmp
69C -50  tmp
70C -58  tmp
71C -60  tmp
72C -68  tmp
73C -70  tmp
74C -78  tmp
75C -80  tmp
76C -88  tmp
77C -90  FREE
78C -98  FREE
79C -a0  FREE
80C -a8  FREE
81C -b0  r13
82C -b8  r12
83C -c0  r11
84C -c8  r10
85C -d0  r9
86C -d8  r8
87C -e0  r7
88C -e8  r6
89C -f0  r5
90C -f8  r4
91C -100 r3
92C  Previous frame:
93C  [unused area]
94C -38/-138 vlimb home slot.  For 2.0N, the vlimb arg will arrive here.
95
96
97include(`../config.m4')
98
99C INPUT PARAMETERS:
100define(`rp',`%r26')	C limb vector that is loaded, added to and stored back
101define(`up',`%r25')	C limb vector whose limbs are multiplied by vlimb
102define(`n',`%r24')	C limb count
103define(`vlimb',`%r23')	C multiplier limb
104
C climb shares %r23 with vlimb.  The entry code zeroes it (after the 2.0w
C variant has stored vlimb to its home slot) and L(done) returns it.
105define(`climb',`%r23')	C running carry limb
106
C Select the assembler level according to whether HAVE_ABI_2_0w is defined.
107ifdef(`HAVE_ABI_2_0w',
108`	.level	2.0w
109',`	.level	2.0
110')
C mpn_addmul_1 entry.  Opens a 0x100-byte frame, saves r3-r5, clears the
C carry limb, loads vlimb into %fr8 and then dispatches on the low two bits
C of n: zero goes straight to the 4-way code at L(BIG), anything else first
C handles 1..3 leading limbs one at a time.
111PROLOGUE(mpn_addmul_1)
112
113ifdef(`HAVE_ABI_2_0w',
114`	std		vlimb, -0x38(%r30)	C store vlimb into "home" slot
115')
116	std,ma		%r3, 0x100(%r30)	C save r3 at old r30, then r30 += 0x100
117	std		%r4, -0xf8(%r30)
118	std		%r5, -0xf0(%r30)
119	ldo		0(%r0), climb		C clear climb
120	fldd		-0x138(%r30), %fr8	C put vlimb in fp register
121
C Register names for the 1..3 limb feed-in code (redefined at L(BIG)).
C p032a1/p032a2 receive the two mid products, p000a the low product and
C p064a the high product of a limb.  m032/m096 hold the mid product sum and
C its carry, ma000/ma064 hold that sum realigned to limb boundaries, r000 is
C the limb loaded from rp and s000 the limb stored back.  Note that m032 and
C ma064 are the same register.
122define(`p032a1',`%r1')	C
123define(`p032a2',`%r19')	C
124
125define(`m032',`%r20')	C
126define(`m096',`%r21')	C
127
128define(`p000a',`%r22')	C
129define(`p064a',`%r29')	C
130
131define(`s000',`%r31')	C
132
133define(`ma000',`%r4')	C
134define(`ma064',`%r20')	C
135
136define(`r000',`%r3')	C
137
138	extrd,u		n, 63, 2, %r5		C r5 = low two bits of n
139	cmpb,=		%r5, %r0, L(BIG)	C no leading limbs: go to 4-way code
140	nop					C branch delay slot
141
C First leading limb: load it, advance up, form its four 32x32 partial
C products (the R and L suffixes select the halves of %fr8 and %fr4) and
C store them to the stack so the integer side can load them.
142	fldd		0(up), %fr4
143	ldo		8(up), up
144	xmpyu		%fr8R, %fr4L, %fr22
145	xmpyu		%fr8L, %fr4R, %fr23
146	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
147	xmpyu		%fr8R, %fr4R, %fr24
148	xmpyu		%fr8L, %fr4L, %fr25
149	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
150	fstd		%fr24, -0x80(%r30)	C low product to  -0x80..-0x79
151	addib,<>	-1, %r5, L(two_or_more)	C branch if r5-1 is nonzero
152	fstd		%fr25, -0x68(%r30)	C high product to -0x68..-0x61
C Exactly one leading limb: fetch its four products into integer registers
C and go to the final summation.  The last ldd sits in the branch delay slot.
153LDEF(one)
154	ldd		-0x78(%r30), p032a1
155	ldd		-0x70(%r30), p032a2
156	ldd		-0x80(%r30), p000a
157	b		L(0_one_out)
158	ldd		-0x68(%r30), p064a
159
C Second leading limb.  Each product of the first limb is loaded from its
C stack slot just before the matching product of the second limb is stored
C into the same slot, so the load/store order below must be kept.
160LDEF(two_or_more)
161	fldd		0(up), %fr4
162	ldo		8(up), up
163	xmpyu		%fr8R, %fr4L, %fr22
164	xmpyu		%fr8L, %fr4R, %fr23
165	ldd		-0x78(%r30), p032a1
166	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
167	xmpyu		%fr8R, %fr4R, %fr24
168	xmpyu		%fr8L, %fr4L, %fr25
169	ldd		-0x70(%r30), p032a2
170	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
171	ldd		-0x80(%r30), p000a
172	fstd		%fr24, -0x80(%r30)	C low product to  -0x80..-0x79
173	ldd		-0x68(%r30), p064a
174	addib,<>	-1, %r5, L(three_or_more)	C branch if r5-1 is nonzero
175	fstd		%fr25, -0x68(%r30)	C high product to -0x68..-0x61
C Exactly two leading limbs.  Sum the two mid products of the first limb
C (carry out captured in m096) and realign the sum: ma000 gets the low half
C of the sum moved to the upper 32 bits, ma064 gets the high half of the sum
C in its lower 32 bits with the carry deposited above it.  The final depd
C sits in the branch delay slot.
176LDEF(two)
177	add		p032a1, p032a2, m032
178	add,dc		%r0, %r0, m096
179	depd,z		m032, 31, 32, ma000
180	extrd,u		m032, 31, 32, ma064
181	ldd		0(rp), r000
182	b		L(0_two_out)
183	depd		m096, 31, 32, ma064
184
C Third leading limb.  Loads the limb, then sums and realigns the mid
C products of the first limb exactly as L(two) does.  Since r5 holds only the
C low two bits of n, at most three leading limbs exist, so control simply
C falls through the disabled L(loop0) body into L(0_out).
185LDEF(three_or_more)
186	fldd		0(up), %fr4
187	add		p032a1, p032a2, m032
188	add,dc		%r0, %r0, m096
189	depd,z		m032, 31, 32, ma000
190	extrd,u		m032, 31, 32, ma064
191	ldd		0(rp), r000
192C	addib,=		-1, %r5, L(0_out)
193	depd		m096, 31, 32, ma064
C The loop body below is commented out in its entirety; only the label
C remains.
194LDEF(loop0)
195C	xmpyu		%fr8R, %fr4L, %fr22
196C	xmpyu		%fr8L, %fr4R, %fr23
197C	ldd		-0x78(%r30), p032a1
198C	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
199C
200C	xmpyu		%fr8R, %fr4R, %fr24
201C	xmpyu		%fr8L, %fr4L, %fr25
202C	ldd		-0x70(%r30), p032a2
203C	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
204C
205C	ldo		8(rp), rp
206C	add		climb, p000a, s000
207C	ldd		-0x80(%r30), p000a
208C	fstd		%fr24, -0x80(%r30)	C low product to  -0x80..-0x79
209C
210C	add,dc		p064a, %r0, climb
211C	ldo		8(up), up
212C	ldd		-0x68(%r30), p064a
213C	fstd		%fr25, -0x68(%r30)	C high product to -0x68..-0x61
214C
215C	add		ma000, s000, s000
216C	add,dc		ma064, climb, climb
217C	fldd		0(up), %fr4
218C
219C	add		r000, s000, s000
220C	add,dc		%r0, climb, climb
221C	std		s000, -8(rp)
222C
223C	add		p032a1, p032a2, m032
224C	add,dc		%r0, %r0, m096
225C
226C	depd,z		m032, 31, 32, ma000
227C	extrd,u		m032, 31, 32, ma064
228C	ldd		0(rp), r000
229C	addib,<>	-1, %r5, L(loop0)
230C	depd		m096, 31, 32, ma064
C Wind-down, three limbs outstanding.  Multiplies the limb already in %fr4,
C swaps the stacked products of the previous limb into integer registers
C while storing the new ones, and completes the oldest limb:
C   s000  = climb + low product, then + ma000, then + r000
C   climb = high product + ma064 + the carries of those additions
C The result limb is stored at -8(rp) because rp has already been advanced.
C Finally the mid products just loaded are summed and realigned for the next
C stage.
231LDEF(0_out)
232	ldo		8(up), up
233	xmpyu		%fr8R, %fr4L, %fr22
234	xmpyu		%fr8L, %fr4R, %fr23
235	ldd		-0x78(%r30), p032a1
236	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
237	xmpyu		%fr8R, %fr4R, %fr24
238	xmpyu		%fr8L, %fr4L, %fr25
239	ldd		-0x70(%r30), p032a2
240	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
241	ldo		8(rp), rp
242	add		climb, p000a, s000
243	ldd		-0x80(%r30), p000a
244	fstd		%fr24, -0x80(%r30)	C low product to  -0x80..-0x79
245	add,dc		p064a, %r0, climb
246	ldd		-0x68(%r30), p064a
247	fstd		%fr25, -0x68(%r30)	C high product to -0x68..-0x61
248	add		ma000, s000, s000
249	add,dc		ma064, climb, climb
250	add		r000, s000, s000
251	add,dc		%r0, climb, climb
252	std		s000, -8(rp)
253	add		p032a1, p032a2, m032
254	add,dc		%r0, %r0, m096
255	depd,z		m032, 31, 32, ma000
256	extrd,u		m032, 31, 32, ma064
257	ldd		0(rp), r000
258	depd		m096, 31, 32, ma064
C Wind-down, two limbs outstanding.  No more multiplies: the products of the
C last limb are loaded from the stack while the limb before it is completed
C (same add / add,dc sequence as in L(0_out)) and stored at -8(rp).  Each
C product register is reloaded only after its old value has been consumed.
259LDEF(0_two_out)
260	ldd		-0x78(%r30), p032a1
261	ldd		-0x70(%r30), p032a2
262	ldo		8(rp), rp
263	add		climb, p000a, s000
264	ldd		-0x80(%r30), p000a
265	add,dc		p064a, %r0, climb
266	ldd		-0x68(%r30), p064a
267	add		ma000, s000, s000
268	add,dc		ma064, climb, climb
269	add		r000, s000, s000
270	add,dc		%r0, climb, climb
271	std		s000, -8(rp)
C Wind-down, last leading limb.  Sums and realigns its mid products, adds
C low product, realigned mid products and the limb from rp into s000 while
C collecting the carries in climb, and stores the result at 0(rp).
272LDEF(0_one_out)
273	add		p032a1, p032a2, m032
274	add,dc		%r0, %r0, m096
275	depd,z		m032, 31, 32, ma000
276	extrd,u		m032, 31, 32, ma064
277	ldd		0(rp), r000
278	depd		m096, 31, 32, ma064
279
280	add		climb, p000a, s000
281	add,dc		p064a, %r0, climb
282	add		ma000, s000, s000
283	add,dc		ma064, climb, climb
284	add		r000, s000, s000
285	add,dc		%r0, climb, climb
286	std		s000, 0(rp)
287
C Branch to L(done) when 4 >= n, i.e. no group of four limbs remains (n is
C not a multiple of 4 on this path).  rp is advanced in the delay slot either
C way; otherwise fall into the 4-way code.
288	cmpib,>=	4, n, L(done)
289	ldo		8(rp), rp
290
291C 4-way unrolled code.
292
C Entered directly when the low two bits of n are zero, or by falling through
C after the leading limbs have been handled (up and rp then already point
C past them).
293LDEF(BIG)
294
C Register names for the 4-way code, replacing the feed-in definitions.
C Limbs of a group are tagged a, b, c, d.  The pNNN names hold partial
C products (mid products in the first group of names, low and high products
C in the third), mNNN the mid product sums, maNNN those sums realigned to
C limb boundaries, sNNN the result limbs and rNNN the limbs loaded from rp.
C Several names share one register (p032a1, p000a and r000 are all %r1, for
C instance), so the order of the instructions using them matters.
295define(`p032a1',`%r1')	C
296define(`p032a2',`%r19')	C
297define(`p096b1',`%r20')	C
298define(`p096b2',`%r21')	C
299define(`p160c1',`%r22')	C
300define(`p160c2',`%r29')	C
301define(`p224d1',`%r31')	C
302define(`p224d2',`%r3')	C
303			C
304define(`m032',`%r4')	C
305define(`m096',`%r5')	C
306define(`m160',`%r6')	C
307define(`m224',`%r7')	C
308define(`m288',`%r8')	C
309			C
310define(`p000a',`%r1')	C
311define(`p064a',`%r19')	C
312define(`p064b',`%r20')	C
313define(`p128b',`%r21')	C
314define(`p128c',`%r22')	C
315define(`p192c',`%r29')	C
316define(`p192d',`%r31')	C
317define(`p256d',`%r3')	C
318			C
319define(`s000',`%r10')	C
320define(`s064',`%r11')	C
321define(`s128',`%r12')	C
322define(`s192',`%r13')	C
323			C
324define(`ma000',`%r9')	C
325define(`ma064',`%r4')	C
326define(`ma128',`%r5')	C
327define(`ma192',`%r6')	C
328define(`ma256',`%r7')	C
329			C
330define(`r000',`%r1')	C
331define(`r064',`%r19')	C
332define(`r128',`%r20')	C
333define(`r192',`%r21')	C
334
C Save r6-r13 in the frame; they are reloaded from the same slots after
C L(end1).
335	std		%r6, -0xe8(%r30)
336	std		%r7, -0xe0(%r30)
337	std		%r8, -0xd8(%r30)
338	std		%r9, -0xd0(%r30)
339	std		%r10, -0xc8(%r30)
340	std		%r11, -0xc0(%r30)
341	std		%r12, -0xb8(%r30)
342	std		%r13, -0xb0(%r30)
343
C Turn n into the number of 4-limb groups still to be processed.
344ifdef(`HAVE_ABI_2_0w',
345`	extrd,u		n, 61, 62, n		C right shift 2
346',`	extrd,u		n, 61, 30, n		C right shift 2, zero extend
347')
348
C Feed-in for the 4-way code: load the first group of four limbs and form
C its sixteen partial products.  %fr22-%fr27 are used twice, first for the
C mid products of limbs a-c and, once those have been stored, for the low
C and high products of limbs b-d.
349LDEF(4_or_more)
350	fldd		0(up), %fr4
351	fldd		8(up), %fr5
352	fldd		16(up), %fr6
353	fldd		24(up), %fr7
354	xmpyu		%fr8R, %fr4L, %fr22
355	xmpyu		%fr8L, %fr4R, %fr23
356	xmpyu		%fr8R, %fr5L, %fr24
357	xmpyu		%fr8L, %fr5R, %fr25
358	xmpyu		%fr8R, %fr6L, %fr26
359	xmpyu		%fr8L, %fr6R, %fr27
360	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
361	xmpyu		%fr8R, %fr7L, %fr28
362	xmpyu		%fr8L, %fr7R, %fr29
363	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
364	xmpyu		%fr8R, %fr4R, %fr30
365	xmpyu		%fr8L, %fr4L, %fr31
366	fstd		%fr24, -0x38(%r30)	C mid product to  -0x38..-0x31
367	xmpyu		%fr8R, %fr5R, %fr22
368	xmpyu		%fr8L, %fr5L, %fr23
369	fstd		%fr25, -0x30(%r30)	C mid product to  -0x30..-0x29
370	xmpyu		%fr8R, %fr6R, %fr24
371	xmpyu		%fr8L, %fr6L, %fr25
372	fstd		%fr26, -0x58(%r30)	C mid product to  -0x58..-0x51
373	xmpyu		%fr8R, %fr7R, %fr26
374	fstd		%fr27, -0x50(%r30)	C mid product to  -0x50..-0x49
375	addib,<>	-1, n, L(8_or_more)	C branch if n-1 is nonzero
376	xmpyu		%fr8L, %fr7L, %fr27
C Only one group: store the remaining products, load the eight mid products
C into integer registers and go directly to the final summation at L(end1).
377	fstd		%fr28, -0x18(%r30)	C mid product to  -0x18..-0x11
378	fstd		%fr29, -0x10(%r30)	C mid product to  -0x10..-0x09
379	fstd		%fr30, -0x80(%r30)	C low product to  -0x80..-0x79
380	fstd		%fr31, -0x68(%r30)	C high product to -0x68..-0x61
381	fstd		%fr22, -0x40(%r30)	C low product to  -0x40..-0x39
382	fstd		%fr23, -0x28(%r30)	C high product to -0x28..-0x21
383	fstd		%fr24, -0x60(%r30)	C low product to  -0x60..-0x59
384	fstd		%fr25, -0x48(%r30)	C high product to -0x48..-0x41
385	fstd		%fr26, -0x20(%r30)	C low product to  -0x20..-0x19
386	fstd		%fr27, -0x88(%r30)	C high product to -0x88..-0x81
387	ldd		-0x78(%r30), p032a1
388	ldd		-0x70(%r30), p032a2
389	ldd		-0x38(%r30), p096b1
390	ldd		-0x30(%r30), p096b2
391	ldd		-0x58(%r30), p160c1
392	ldd		-0x50(%r30), p160c2
393	ldd		-0x18(%r30), p224d1
394	ldd		-0x10(%r30), p224d2
395	b		L(end1)
396	nop
397
C At least two groups.  Store the rest of the products of the first group,
C advance up by four limbs and load the second group.  While the second
C group is multiplied, the mid products of the first group are loaded into
C integer registers; each such load precedes the store that reuses its slot
C for the second group.  n is then decremented again and, if it reaches
C zero, the steady-state loop is skipped in favour of L(end2).
398LDEF(8_or_more)
399	fstd		%fr28, -0x18(%r30)	C mid product to  -0x18..-0x11
400	fstd		%fr29, -0x10(%r30)	C mid product to  -0x10..-0x09
401	ldo		32(up), up
402	fstd		%fr30, -0x80(%r30)	C low product to  -0x80..-0x79
403	fstd		%fr31, -0x68(%r30)	C high product to -0x68..-0x61
404	fstd		%fr22, -0x40(%r30)	C low product to  -0x40..-0x39
405	fstd		%fr23, -0x28(%r30)	C high product to -0x28..-0x21
406	fstd		%fr24, -0x60(%r30)	C low product to  -0x60..-0x59
407	fstd		%fr25, -0x48(%r30)	C high product to -0x48..-0x41
408	fstd		%fr26, -0x20(%r30)	C low product to  -0x20..-0x19
409	fstd		%fr27, -0x88(%r30)	C high product to -0x88..-0x81
410	fldd		0(up), %fr4
411	fldd		8(up), %fr5
412	fldd		16(up), %fr6
413	fldd		24(up), %fr7
414	xmpyu		%fr8R, %fr4L, %fr22
415	ldd		-0x78(%r30), p032a1
416	xmpyu		%fr8L, %fr4R, %fr23
417	xmpyu		%fr8R, %fr5L, %fr24
418	ldd		-0x70(%r30), p032a2
419	xmpyu		%fr8L, %fr5R, %fr25
420	xmpyu		%fr8R, %fr6L, %fr26
421	ldd		-0x38(%r30), p096b1
422	xmpyu		%fr8L, %fr6R, %fr27
423	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
424	xmpyu		%fr8R, %fr7L, %fr28
425	ldd		-0x30(%r30), p096b2
426	xmpyu		%fr8L, %fr7R, %fr29
427	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
428	xmpyu		%fr8R, %fr4R, %fr30
429	ldd		-0x58(%r30), p160c1
430	xmpyu		%fr8L, %fr4L, %fr31
431	fstd		%fr24, -0x38(%r30)	C mid product to  -0x38..-0x31
432	xmpyu		%fr8R, %fr5R, %fr22
433	ldd		-0x50(%r30), p160c2
434	xmpyu		%fr8L, %fr5L, %fr23
435	fstd		%fr25, -0x30(%r30)	C mid product to  -0x30..-0x29
436	xmpyu		%fr8R, %fr6R, %fr24
437	ldd		-0x18(%r30), p224d1
438	xmpyu		%fr8L, %fr6L, %fr25
439	fstd		%fr26, -0x58(%r30)	C mid product to  -0x58..-0x51
440	xmpyu		%fr8R, %fr7R, %fr26
441	ldd		-0x10(%r30), p224d2
442	fstd		%fr27, -0x50(%r30)	C mid product to  -0x50..-0x49
443	addib,=		-1, n, L(end2)	C branch if n-1 is zero
444	xmpyu		%fr8L, %fr7L, %fr27
C Steady-state loop, one group of four limbs per iteration, with three
C groups in flight:
C  - the group whose mid products are in integer registers is summed,
C    added to the limbs at rp and stored;
C  - the group whose products are still in FP registers or on the stack is
C    moved through the stack slots, each fstd following the ldd that frees
C    its slot;
C  - the next four limbs are loaded from up and multiplied.
C The add / add,dc carry chains run across the interleaved loads and
C stores, so the instruction order within each chain must be kept.
445LDEF(loop)
C Sum the mid product pairs; m288 captures the final carry.
446	add		p032a1, p032a2, m032
447	ldd		-0x80(%r30), p000a
448	add,dc		p096b1, p096b2, m096
449	fstd		%fr28, -0x18(%r30)	C mid product to  -0x18..-0x11
450
451	add,dc		p160c1, p160c2, m160
452	ldd		-0x68(%r30), p064a
453	add,dc		p224d1, p224d2, m224
454	fstd		%fr29, -0x10(%r30)	C mid product to  -0x10..-0x09
455
456	add,dc		%r0, %r0, m288
457	ldd		-0x40(%r30), p064b
458	ldo		32(up), up
459	fstd		%fr30, -0x80(%r30)	C low product to  -0x80..-0x79
460
C Realign the mid sums by 32 bits into ma000..ma256.
461	depd,z		m032, 31, 32, ma000
462	ldd		-0x28(%r30), p128b
463	extrd,u		m032, 31, 32, ma064
464	fstd		%fr31, -0x68(%r30)	C high product to -0x68..-0x61
465
466	depd		m096, 31, 32, ma064
467	ldd		-0x60(%r30), p128c
468	extrd,u		m096, 31, 32, ma128
469	fstd		%fr22, -0x40(%r30)	C low product to  -0x40..-0x39
470
471	depd		m160, 31, 32, ma128
472	ldd		-0x48(%r30), p192c
473	extrd,u		m160, 31, 32, ma192
474	fstd		%fr23, -0x28(%r30)	C high product to -0x28..-0x21
475
476	depd		m224, 31, 32, ma192
477	ldd		-0x20(%r30), p192d
478	extrd,u		m224, 31, 32, ma256
479	fstd		%fr24, -0x60(%r30)	C low product to  -0x60..-0x59
480
C Add the incoming carry limb and the low and high products; climb becomes
C the top limb of this group.
481	depd		m288, 31, 32, ma256
482	ldd		-0x88(%r30), p256d
483	add		climb, p000a, s000
484	fstd		%fr25, -0x48(%r30)	C high product to -0x48..-0x41
485
486	add,dc		p064a, p064b, s064
487	ldd		0(rp), r000
488	add,dc		p128b, p128c, s128
489	fstd		%fr26, -0x20(%r30)	C low product to  -0x20..-0x19
490
491	add,dc		p192c, p192d, s192
492	ldd		8(rp), r064
493	add,dc		p256d, %r0, climb
494	fstd		%fr27, -0x88(%r30)	C high product to -0x88..-0x81
495
496	ldd		16(rp), r128
497	add		ma000, s000, s000	C accum mid 0
498	ldd		24(rp), r192
499	add,dc		ma064, s064, s064	C accum mid 1
500
501	add,dc		ma128, s128, s128	C accum mid 2
502	fldd		0(up), %fr4
503	add,dc		ma192, s192, s192	C accum mid 3
504	fldd		8(up), %fr5
505
506	add,dc		ma256, climb, climb
507	fldd		16(up), %fr6
508	add		r000, s000, s000	C accum rlimb 0
509	fldd		24(up), %fr7
510
511	add,dc		r064, s064, s064	C accum rlimb 1
512	add,dc		r128, s128, s128	C accum rlimb 2
513	std		s000, 0(rp)
514
515	add,dc		r192, s192, s192	C accum rlimb 3
516	add,dc		%r0, climb, climb
517	std		s064, 8(rp)
518
C Multiply the limbs just loaded and fetch the stacked mid products for the
C next iteration.
519	xmpyu		%fr8R, %fr4L, %fr22
520	ldd		-0x78(%r30), p032a1
521	xmpyu		%fr8L, %fr4R, %fr23
522	std		s128, 16(rp)
523
524	xmpyu		%fr8R, %fr5L, %fr24
525	ldd		-0x70(%r30), p032a2
526	xmpyu		%fr8L, %fr5R, %fr25
527	std		s192, 24(rp)
528
529	xmpyu		%fr8R, %fr6L, %fr26
530	ldd		-0x38(%r30), p096b1
531	xmpyu		%fr8L, %fr6R, %fr27
532	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
533
534	xmpyu		%fr8R, %fr7L, %fr28
535	ldd		-0x30(%r30), p096b2
536	xmpyu		%fr8L, %fr7R, %fr29
537	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
538
539	xmpyu		%fr8R, %fr4R, %fr30
540	ldd		-0x58(%r30), p160c1
541	xmpyu		%fr8L, %fr4L, %fr31
542	fstd		%fr24, -0x38(%r30)	C mid product to  -0x38..-0x31
543
544	xmpyu		%fr8R, %fr5R, %fr22
545	ldd		-0x50(%r30), p160c2
546	xmpyu		%fr8L, %fr5L, %fr23
547	fstd		%fr25, -0x30(%r30)	C mid product to  -0x30..-0x29
548
549	xmpyu		%fr8R, %fr6R, %fr24
550	ldd		-0x18(%r30), p224d1
551	xmpyu		%fr8L, %fr6L, %fr25
552	fstd		%fr26, -0x58(%r30)	C mid product to  -0x58..-0x51
553
554	xmpyu		%fr8R, %fr7R, %fr26
555	ldd		-0x10(%r30), p224d2
556	fstd		%fr27, -0x50(%r30)	C mid product to  -0x50..-0x49
557	xmpyu		%fr8L, %fr7L, %fr27
558
C Count down the remaining groups; rp is advanced in the delay slot.
559	addib,<>	-1, n, L(loop)
560	ldo		32(rp), rp
561
C Wind-down with two groups outstanding.  Same summation as one loop
C iteration, still moving the products of the last group from FP registers
C to the stack, but with no further limb loads or multiplies.  After storing
C the four result limbs it loads the mid products of the last group,
C advances rp by four limbs and falls into L(end1).
562LDEF(end2)
563	add		p032a1, p032a2, m032
564	ldd		-0x80(%r30), p000a
565	add,dc		p096b1, p096b2, m096
566	fstd		%fr28, -0x18(%r30)	C mid product to  -0x18..-0x11
567	add,dc		p160c1, p160c2, m160
568	ldd		-0x68(%r30), p064a
569	add,dc		p224d1, p224d2, m224
570	fstd		%fr29, -0x10(%r30)	C mid product to  -0x10..-0x09
571	add,dc		%r0, %r0, m288
572	ldd		-0x40(%r30), p064b
573	fstd		%fr30, -0x80(%r30)	C low product to  -0x80..-0x79
574	depd,z		m032, 31, 32, ma000
575	ldd		-0x28(%r30), p128b
576	extrd,u		m032, 31, 32, ma064
577	fstd		%fr31, -0x68(%r30)	C high product to -0x68..-0x61
578	depd		m096, 31, 32, ma064
579	ldd		-0x60(%r30), p128c
580	extrd,u		m096, 31, 32, ma128
581	fstd		%fr22, -0x40(%r30)	C low product to  -0x40..-0x39
582	depd		m160, 31, 32, ma128
583	ldd		-0x48(%r30), p192c
584	extrd,u		m160, 31, 32, ma192
585	fstd		%fr23, -0x28(%r30)	C high product to -0x28..-0x21
586	depd		m224, 31, 32, ma192
587	ldd		-0x20(%r30), p192d
588	extrd,u		m224, 31, 32, ma256
589	fstd		%fr24, -0x60(%r30)	C low product to  -0x60..-0x59
590	depd		m288, 31, 32, ma256
591	ldd		-0x88(%r30), p256d
592	add		climb, p000a, s000
593	fstd		%fr25, -0x48(%r30)	C high product to -0x48..-0x41
594	add,dc		p064a, p064b, s064
595	ldd		0(rp), r000
596	add,dc		p128b, p128c, s128
597	fstd		%fr26, -0x20(%r30)	C low product to  -0x20..-0x19
598	add,dc		p192c, p192d, s192
599	ldd		8(rp), r064
600	add,dc		p256d, %r0, climb
601	fstd		%fr27, -0x88(%r30)	C high product to -0x88..-0x81
602	ldd		16(rp), r128
603	add		ma000, s000, s000	C accum mid 0
604	ldd		24(rp), r192
605	add,dc		ma064, s064, s064	C accum mid 1
606	add,dc		ma128, s128, s128	C accum mid 2
607	add,dc		ma192, s192, s192	C accum mid 3
608	add,dc		ma256, climb, climb
609	add		r000, s000, s000	C accum rlimb 0
610	add,dc		r064, s064, s064	C accum rlimb 1
611	add,dc		r128, s128, s128	C accum rlimb 2
612	std		s000, 0(rp)
613	add,dc		r192, s192, s192	C accum rlimb 3
614	add,dc		%r0, climb, climb
615	std		s064, 8(rp)
616	ldd		-0x78(%r30), p032a1
617	std		s128, 16(rp)
618	ldd		-0x70(%r30), p032a2
619	std		s192, 24(rp)
620	ldd		-0x38(%r30), p096b1
621	ldd		-0x30(%r30), p096b2
622	ldd		-0x58(%r30), p160c1
623	ldd		-0x50(%r30), p160c2
624	ldd		-0x18(%r30), p224d1
625	ldd		-0x10(%r30), p224d2
626	ldo		32(rp), rp
627
C Final group.  All of its products are on the stack (mid products already
C in integer registers), so this is the pure integer part of the loop body:
C sum and realign the mid products, add low and high products, the incoming
C carry limb and the four limbs at rp, store the results and leave the
C outgoing carry in climb.
628LDEF(end1)
629	add		p032a1, p032a2, m032
630	ldd		-0x80(%r30), p000a
631	add,dc		p096b1, p096b2, m096
632	add,dc		p160c1, p160c2, m160
633	ldd		-0x68(%r30), p064a
634	add,dc		p224d1, p224d2, m224
635	add,dc		%r0, %r0, m288
636	ldd		-0x40(%r30), p064b
637	depd,z		m032, 31, 32, ma000
638	ldd		-0x28(%r30), p128b
639	extrd,u		m032, 31, 32, ma064
640	depd		m096, 31, 32, ma064
641	ldd		-0x60(%r30), p128c
642	extrd,u		m096, 31, 32, ma128
643	depd		m160, 31, 32, ma128
644	ldd		-0x48(%r30), p192c
645	extrd,u		m160, 31, 32, ma192
646	depd		m224, 31, 32, ma192
647	ldd		-0x20(%r30), p192d
648	extrd,u		m224, 31, 32, ma256
649	depd		m288, 31, 32, ma256
650	ldd		-0x88(%r30), p256d
651	add		climb, p000a, s000
652	add,dc		p064a, p064b, s064
653	ldd		0(rp), r000
654	add,dc		p128b, p128c, s128
655	add,dc		p192c, p192d, s192
656	ldd		8(rp), r064
657	add,dc		p256d, %r0, climb
658	ldd		16(rp), r128
659	add		ma000, s000, s000	C accum mid 0
660	ldd		24(rp), r192
661	add,dc		ma064, s064, s064	C accum mid 1
662	add,dc		ma128, s128, s128	C accum mid 2
663	add,dc		ma192, s192, s192	C accum mid 3
664	add,dc		ma256, climb, climb
665	add		r000, s000, s000	C accum rlimb 0
666	add,dc		r064, s064, s064	C accum rlimb 1
667	add,dc		r128, s128, s128	C accum rlimb 2
668	std		s000, 0(rp)
669	add,dc		r192, s192, s192	C accum rlimb 3
670	add,dc		%r0, climb, climb
671	std		s064, 8(rp)
672	std		s128, 16(rp)
673	std		s192, 24(rp)
674
C Reload r6-r13 from the slots they were saved to at L(BIG).
675	ldd		-0xb0(%r30), %r13
676	ldd		-0xb8(%r30), %r12
677	ldd		-0xc0(%r30), %r11
678	ldd		-0xc8(%r30), %r10
679	ldd		-0xd0(%r30), %r9
680	ldd		-0xd8(%r30), %r8
681	ldd		-0xe0(%r30), %r7
682	ldd		-0xe8(%r30), %r6
C Common exit.  Returns the carry limb: with HAVE_ABI_2_0w it is copied
C whole to %r28, otherwise its high 32 bits go to %r28 and its low 32 bits
C to %r29.  Then r5 and r4 are reloaded and the routine returns through
C %r2; the ldd,mb in the delay slot moves r30 back by 0x100 and reloads r3,
C undoing the std,ma at entry.
683LDEF(done)
684ifdef(`HAVE_ABI_2_0w',
685`	copy		climb, %r28
686',`	extrd,u		climb, 63, 32, %r29
687	extrd,u		climb, 31, 32, %r28
688')
689	ldd		-0xf0(%r30), %r5
690	ldd		-0xf8(%r30), %r4
691	bve		(%r2)
692	ldd,mb		-0x100(%r30), %r3
693EPILOGUE(mpn_addmul_1)
694