xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/pa64/mul_1.asm (revision bdc22b2e01993381dcefeff2bc9b56ca75a4235c)
1dnl  HP-PA 2.0 64-bit mpn_mul_1 -- Multiply a limb vector with a limb and store
2dnl  the result in a second limb vector.
3
4dnl  Copyright 1998-2000, 2002, 2003 Free Software Foundation, Inc.
5
6dnl  This file is part of the GNU MP Library.
7dnl
8dnl  The GNU MP Library is free software; you can redistribute it and/or modify
9dnl  it under the terms of either:
10dnl
11dnl    * the GNU Lesser General Public License as published by the Free
12dnl      Software Foundation; either version 3 of the License, or (at your
13dnl      option) any later version.
14dnl
15dnl  or
16dnl
17dnl    * the GNU General Public License as published by the Free Software
18dnl      Foundation; either version 2 of the License, or (at your option) any
19dnl      later version.
20dnl
21dnl  or both in parallel, as here.
22dnl
23dnl  The GNU MP Library is distributed in the hope that it will be useful, but
24dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
25dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
26dnl  for more details.
27dnl
28dnl  You should have received copies of the GNU General Public License and the
29dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
30dnl  see https://www.gnu.org/licenses/.
31
32include(`../config.m4')
33
34C		    cycles/limb
35C 8000,8200:		6.5
36C 8500,8600,8700:	5.625
37
38C  The feed-in and wind-down code has not yet been scheduled.  Many cycles
39C  could be saved there per call.
40
41C  DESCRIPTION:
42C  The main loop "BIG" is 4-way unrolled, mainly to allow
43C  effective use of ADD,DC.  Delays in moving data via the cache from the FP
44C  registers to the IU registers, have demanded a deep software pipeline, and
45C  a lot of stack slots for partial products in flight.
46C
47C  CODE STRUCTURE:
48C  save-some-registers
49C  do 0, 1, 2, or 3 limbs
50C  if done, restore-some-regs and return
51C  save-many-regs
52C  do 4, 8, ... limbs
53C  restore-all-regs
54
55C  STACK LAYOUT:
56C  HP-PA stack grows upwards.  We could allocate 8 fewer slots by using the
57C  slots marked FREE, as well as some slots in the caller's "frame marker".
58C
59C -00 <- r30
60C -08  FREE
61C -10  tmp
62C -18  tmp
63C -20  tmp
64C -28  tmp
65C -30  tmp
66C -38  tmp
67C -40  tmp
68C -48  tmp
69C -50  tmp
70C -58  tmp
71C -60  tmp
72C -68  tmp
73C -70  tmp
74C -78  tmp
75C -80  tmp
76C -88  tmp
77C -90  FREE
78C -98  FREE
79C -a0  FREE
80C -a8  FREE
81C -b0  r13
82C -b8  r12
83C -c0  r11
84C -c8  r10
85C -d0  r9
86C -d8  r8
87C -e0  r7
88C -e8  r6
89C -f0  r5
90C -f8  r4
91C -100 r3
92C  Previous frame:
93C  [unused area]
94C -38/-138 vlimb home slot.  For 2.0N, the vlimb arg will arrive here.
95
96
97include(`../config.m4')
98
99C INPUT PARAMETERS:
C Argument registers.  Source limbs are loaded through up and result limbs
C are stored through rp.
100define(`rp',`%r26')	C result limb vector
101define(`up',`%r25')	C source limb vector
102define(`n',`%r24')	C limb count
103define(`vlimb',`%r23')	C multiplier limb
104
C climb is the carry limb handed from one result limb to the next and
C returned at L(done).  It shares %r23 with vlimb: the prologue fetches vlimb
C into %fr8 from its stack home slot, so the integer register can be reused.
105define(`climb',`%r23')	C
106
107ifdef(`HAVE_ABI_2_0w',
108`	.level	2.0w
109',`	.level	2.0
110')
C mpn_mul_1 entry.  Saves r3-r5, allocates the 0x100 byte frame, clears the
C carry limb and brings vlimb into %fr8 by way of its stack home slot.
111PROLOGUE(mpn_mul_1)
112
113ifdef(`HAVE_ABI_2_0w',
114`	std		vlimb, -0x38(%r30)	C store vlimb into "home" slot
115')
116	std,ma		%r3, 0x100(%r30)	C save r3 at old r30, then r30 += 0x100
117	std		%r4, -0xf8(%r30)
118	std		%r5, -0xf0(%r30)
119	ldo		0(%r0), climb		C clear climb
120	fldd		-0x138(%r30), %fr8	C put vlimb in fp register
121
C Register names for the feed-in code, which handles the first n mod 4 limbs
C one at a time.  Most of these names are redefined at L(BIG).
122define(`p032a1',`%r1')	C mid product %fr8R * %fr4L, reloaded from -0x78
123define(`p032a2',`%r19')	C mid product %fr8L * %fr4R, reloaded from -0x70
124
125define(`m032',`%r20')	C p032a1 + p032a2
126define(`m096',`%r21')	C carry out of that sum
127
128define(`p000a',`%r22')	C low product, reloaded from -0x80
129define(`p064a',`%r29')	C high product, reloaded from -0x68
130
131define(`s000',`%r31')	C result limb being assembled
132
133define(`ma000',`%r4')	C low half of m032 placed in the upper half, rest zero
134define(`ma064',`%r20')	C upper half of m032, with m096 deposited above it
135
136C define(`r000',`%r3')	C	FIXME don't save r3 for n < 4.
137
138	extrd,u		n, 63, 2, %r5		C r5 = low two bits of n
139	cmpb,=		%r5, %r0, L(BIG)	C n mod 4 == 0: no feed-in limbs
140	nop
141
C First feed-in limb: form its four partial products and park them in the
C stack slots -0x80..-0x61 so the integer unit can reload them.
142	fldd		0(up), %fr4
143	ldo		8(up), up
144	xmpyu		%fr8R, %fr4L, %fr22
145	xmpyu		%fr8L, %fr4R, %fr23
146	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
147	xmpyu		%fr8R, %fr4R, %fr24
148	xmpyu		%fr8L, %fr4L, %fr25
149	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
150	fstd		%fr24, -0x80(%r30)	C low product to  -0x80..-0x79
151	addib,<>	-1, %r5, L(two_or_more)
152	fstd		%fr25, -0x68(%r30)	C high product to -0x68..-0x61
C Exactly one feed-in limb: reload its products and finish at L(0_one_out).
C The last ldd sits in the delay slot of the branch.
153LDEF(one)
154	ldd		-0x78(%r30), p032a1
155	ldd		-0x70(%r30), p032a2
156	ldd		-0x80(%r30), p000a
157	b		L(0_one_out)
158	ldd		-0x68(%r30), p064a
159
C Second feed-in limb: multiply it while reloading the products of the first
C limb.  Each slot is reloaded immediately before it is overwritten.
160LDEF(two_or_more)
161	fldd		0(up), %fr4
162	ldo		8(up), up
163	xmpyu		%fr8R, %fr4L, %fr22
164	xmpyu		%fr8L, %fr4R, %fr23
165	ldd		-0x78(%r30), p032a1
166	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
167	xmpyu		%fr8R, %fr4R, %fr24
168	xmpyu		%fr8L, %fr4L, %fr25
169	ldd		-0x70(%r30), p032a2
170	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
171	ldd		-0x80(%r30), p000a
172	fstd		%fr24, -0x80(%r30)	C low product to  -0x80..-0x79
173	ldd		-0x68(%r30), p064a
174	addib,<>	-1, %r5, L(three_or_more)
175	fstd		%fr25, -0x68(%r30)	C high product to -0x68..-0x61
C Exactly two feed-in limbs: combine the mid products of the first limb, then
C finish at L(0_two_out).
176LDEF(two)
177	add		p032a1, p032a2, m032
178	add,dc		%r0, %r0, m096
179	depd,z		m032, 31, 32, ma000
180	extrd,u		m032, 31, 32, ma064
181	b		L(0_two_out)
182	depd		m096, 31, 32, ma064
183
C Three feed-in limbs (r5 holds only two bits, so three is the maximum).
C Start loading the third limb and combine the mid products of the first.
C The general loop at L(loop0) is commented out.
184LDEF(three_or_more)
185	fldd		0(up), %fr4
186	add		p032a1, p032a2, m032
187	add,dc		%r0, %r0, m096
188	depd,z		m032, 31, 32, ma000
189	extrd,u		m032, 31, 32, ma064
190C	addib,=		-1, %r5, L(0_out)
191	depd		m096, 31, 32, ma064
192LDEF(loop0)
193C	xmpyu		%fr8R, %fr4L, %fr22
194C	xmpyu		%fr8L, %fr4R, %fr23
195C	ldd		-0x78(%r30), p032a1
196C	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
197C
198C	xmpyu		%fr8R, %fr4R, %fr24
199C	xmpyu		%fr8L, %fr4L, %fr25
200C	ldd		-0x70(%r30), p032a2
201C	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
202C
203C	ldo		8(rp), rp
204C	add		climb, p000a, s000
205C	ldd		-0x80(%r30), p000a
206C	fstd		%fr24, -0x80(%r30)	C low product to  -0x80..-0x79
207C
208C	add,dc		p064a, %r0, climb
209C	ldo		8(up), up
210C	ldd		-0x68(%r30), p064a
211C	fstd		%fr25, -0x68(%r30)	C high product to -0x68..-0x61
212C
213C	add		ma000, s000, s000
214C	add,dc		ma064, climb, climb
215C	fldd		0(up), %fr4
216C
217C	std		s000, -8(rp)
218C
219C	add		p032a1, p032a2, m032
220C	add,dc		%r0, %r0, m096
221C
222C	depd,z		m032, 31, 32, ma000
223C	extrd,u		m032, 31, 32, ma064
224C	addib,<>	-1, %r5, L(loop0)
225C	depd		m096, 31, 32, ma064
C Multiply the limb already in %fr4, reload the previous limb's products,
C and store one finished result limb at -8(rp) after rp has been advanced.
226LDEF(0_out)
227	ldo		8(up), up
228	xmpyu		%fr8R, %fr4L, %fr22
229	xmpyu		%fr8L, %fr4R, %fr23
230	ldd		-0x78(%r30), p032a1
231	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
232	xmpyu		%fr8R, %fr4R, %fr24
233	xmpyu		%fr8L, %fr4L, %fr25
234	ldd		-0x70(%r30), p032a2
235	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
236	ldo		8(rp), rp
237	add		climb, p000a, s000
238	ldd		-0x80(%r30), p000a
239	fstd		%fr24, -0x80(%r30)	C low product to  -0x80..-0x79
240	add,dc		p064a, %r0, climb
241	ldd		-0x68(%r30), p064a
242	fstd		%fr25, -0x68(%r30)	C high product to -0x68..-0x61
243	add		ma000, s000, s000
244	add,dc		ma064, climb, climb
245	std		s000, -8(rp)
246	add		p032a1, p032a2, m032
247	add,dc		%r0, %r0, m096
248	depd,z		m032, 31, 32, ma000
249	extrd,u		m032, 31, 32, ma064
250	depd		m096, 31, 32, ma064
C Reload the products of the last feed-in limb and store the result limb that
C precedes it.
251LDEF(0_two_out)
252	ldd		-0x78(%r30), p032a1
253	ldd		-0x70(%r30), p032a2
254	ldo		8(rp), rp
255	add		climb, p000a, s000
256	ldd		-0x80(%r30), p000a
257	add,dc		p064a, %r0, climb
258	ldd		-0x68(%r30), p064a
259	add		ma000, s000, s000
260	add,dc		ma064, climb, climb
261	std		s000, -8(rp)
C Last feed-in limb: combine the mid products, add low product and incoming
C carry, store the limb and leave the outgoing carry in climb.
262LDEF(0_one_out)
263	add		p032a1, p032a2, m032
264	add,dc		%r0, %r0, m096
265	depd,z		m032, 31, 32, ma000
266	extrd,u		m032, 31, 32, ma064
267	depd		m096, 31, 32, ma064
268
269	add		climb, p000a, s000
270	add,dc		p064a, %r0, climb
271	add		ma000, s000, s000
272	add,dc		ma064, climb, climb
273	std		s000, 0(rp)
274
C Return if 4 >= n; otherwise fall into the 4-way unrolled code.  The rp
C update is in the delay slot of the branch.
275	cmpib,>=	4, n, L(done)
276	ldo		8(rp), rp
277
278C 4-way unrolled code.
279
C Entry to the 4-way unrolled code.  Redefines the register names for four
C limbs a, b, c, d per pass, saves r6-r13 and turns n into a count of
C four-limb groups.
280LDEF(BIG)
281
C Mid products, two per limb.
282define(`p032a1',`%r1')	C
283define(`p032a2',`%r19')	C
284define(`p096b1',`%r20')	C
285define(`p096b2',`%r21')	C
286define(`p160c1',`%r22')	C
287define(`p160c2',`%r29')	C
288define(`p224d1',`%r31')	C
289define(`p224d2',`%r3')	C
290			C
C Sums of each mid product pair; m288 receives the final carry.
291define(`m032',`%r4')	C
292define(`m096',`%r5')	C
293define(`m160',`%r6')	C
294define(`m224',`%r7')	C
295define(`m288',`%r8')	C
296			C
C Low and high products.  These reuse the mid product registers, which are
C reloaded only after the mid products have been summed into m032..m224.
297define(`p000a',`%r1')	C
298define(`p064a',`%r19')	C
299define(`p064b',`%r20')	C
300define(`p128b',`%r21')	C
301define(`p128c',`%r22')	C
302define(`p192c',`%r29')	C
303define(`p192d',`%r31')	C
304define(`p256d',`%r3')	C
305			C
C The four result limbs of a pass.
306define(`s000',`%r10')	C
307define(`s064',`%r11')	C
308define(`s128',`%r12')	C
309define(`s192',`%r13')	C
310			C
C Mid sums realigned by 32 bits.  ma064..ma256 overwrite m032..m224 in place.
311define(`ma000',`%r9')	C
312define(`ma064',`%r4')	C
313define(`ma128',`%r5')	C
314define(`ma192',`%r6')	C
315define(`ma256',`%r7')	C
316
C Save r6-r13; they are reloaded from the same slots after L(end1).
317	std		%r6, -0xe8(%r30)
318	std		%r7, -0xe0(%r30)
319	std		%r8, -0xd8(%r30)
320	std		%r9, -0xd0(%r30)
321	std		%r10, -0xc8(%r30)
322	std		%r11, -0xc0(%r30)
323	std		%r12, -0xb8(%r30)
324	std		%r13, -0xb0(%r30)
325
C n becomes the number of four-limb groups still to do.
326ifdef(`HAVE_ABI_2_0w',
327`	extrd,u		n, 61, 62, n		C right shift 2
328',`	extrd,u		n, 61, 30, n		C right shift 2, zero extend
329')
330
C Load the first group of four limbs and form all sixteen partial products.
C If this is the only group, spill the remaining products, reload the eight
C mid products and go to L(end1); otherwise continue at L(8_or_more).
331LDEF(4_or_more)
332	fldd		0(up), %fr4
333	fldd		8(up), %fr5
334	fldd		16(up), %fr6
335	fldd		24(up), %fr7
336	xmpyu		%fr8R, %fr4L, %fr22
337	xmpyu		%fr8L, %fr4R, %fr23
338	xmpyu		%fr8R, %fr5L, %fr24
339	xmpyu		%fr8L, %fr5R, %fr25
340	xmpyu		%fr8R, %fr6L, %fr26
341	xmpyu		%fr8L, %fr6R, %fr27
342	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
343	xmpyu		%fr8R, %fr7L, %fr28
344	xmpyu		%fr8L, %fr7R, %fr29
345	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
346	xmpyu		%fr8R, %fr4R, %fr30
347	xmpyu		%fr8L, %fr4L, %fr31
348	fstd		%fr24, -0x38(%r30)	C mid product to  -0x38..-0x31
349	xmpyu		%fr8R, %fr5R, %fr22
350	xmpyu		%fr8L, %fr5L, %fr23
351	fstd		%fr25, -0x30(%r30)	C mid product to  -0x30..-0x29
352	xmpyu		%fr8R, %fr6R, %fr24
353	xmpyu		%fr8L, %fr6L, %fr25
354	fstd		%fr26, -0x58(%r30)	C mid product to  -0x58..-0x51
355	xmpyu		%fr8R, %fr7R, %fr26
356	fstd		%fr27, -0x50(%r30)	C mid product to  -0x50..-0x49
C The final xmpyu is in the delay slot of the branch.
357	addib,<>	-1, n, L(8_or_more)
358	xmpyu		%fr8L, %fr7L, %fr27
359	fstd		%fr28, -0x18(%r30)	C mid product to  -0x18..-0x11
360	fstd		%fr29, -0x10(%r30)	C mid product to  -0x10..-0x09
361	fstd		%fr30, -0x80(%r30)	C low product to  -0x80..-0x79
362	fstd		%fr31, -0x68(%r30)	C high product to -0x68..-0x61
363	fstd		%fr22, -0x40(%r30)	C low product to  -0x40..-0x39
364	fstd		%fr23, -0x28(%r30)	C high product to -0x28..-0x21
365	fstd		%fr24, -0x60(%r30)	C low product to  -0x60..-0x59
366	fstd		%fr25, -0x48(%r30)	C high product to -0x48..-0x41
367	fstd		%fr26, -0x20(%r30)	C low product to  -0x20..-0x19
368	fstd		%fr27, -0x88(%r30)	C high product to -0x88..-0x81
369	ldd		-0x78(%r30), p032a1
370	ldd		-0x70(%r30), p032a2
371	ldd		-0x38(%r30), p096b1
372	ldd		-0x30(%r30), p096b2
373	ldd		-0x58(%r30), p160c1
374	ldd		-0x50(%r30), p160c2
375	ldd		-0x18(%r30), p224d1
376	ldd		-0x10(%r30), p224d2
377	b		L(end1)
378	nop
379
C At least two groups.  Spill the rest of the first group's products, then
C load and multiply the second group while reloading the first group's mid
C products.  Each mid product slot is reloaded before the second group's
C value is stored into it.
380LDEF(8_or_more)
381	fstd		%fr28, -0x18(%r30)	C mid product to  -0x18..-0x11
382	fstd		%fr29, -0x10(%r30)	C mid product to  -0x10..-0x09
383	ldo		32(up), up
384	fstd		%fr30, -0x80(%r30)	C low product to  -0x80..-0x79
385	fstd		%fr31, -0x68(%r30)	C high product to -0x68..-0x61
386	fstd		%fr22, -0x40(%r30)	C low product to  -0x40..-0x39
387	fstd		%fr23, -0x28(%r30)	C high product to -0x28..-0x21
388	fstd		%fr24, -0x60(%r30)	C low product to  -0x60..-0x59
389	fstd		%fr25, -0x48(%r30)	C high product to -0x48..-0x41
390	fstd		%fr26, -0x20(%r30)	C low product to  -0x20..-0x19
391	fstd		%fr27, -0x88(%r30)	C high product to -0x88..-0x81
392	fldd		0(up), %fr4
393	fldd		8(up), %fr5
394	fldd		16(up), %fr6
395	fldd		24(up), %fr7
396	xmpyu		%fr8R, %fr4L, %fr22
397	ldd		-0x78(%r30), p032a1
398	xmpyu		%fr8L, %fr4R, %fr23
399	xmpyu		%fr8R, %fr5L, %fr24
400	ldd		-0x70(%r30), p032a2
401	xmpyu		%fr8L, %fr5R, %fr25
402	xmpyu		%fr8R, %fr6L, %fr26
403	ldd		-0x38(%r30), p096b1
404	xmpyu		%fr8L, %fr6R, %fr27
405	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
406	xmpyu		%fr8R, %fr7L, %fr28
407	ldd		-0x30(%r30), p096b2
408	xmpyu		%fr8L, %fr7R, %fr29
409	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
410	xmpyu		%fr8R, %fr4R, %fr30
411	ldd		-0x58(%r30), p160c1
412	xmpyu		%fr8L, %fr4L, %fr31
413	fstd		%fr24, -0x38(%r30)	C mid product to  -0x38..-0x31
414	xmpyu		%fr8R, %fr5R, %fr22
415	ldd		-0x50(%r30), p160c2
416	xmpyu		%fr8L, %fr5L, %fr23
417	fstd		%fr25, -0x30(%r30)	C mid product to  -0x30..-0x29
418	xmpyu		%fr8R, %fr6R, %fr24
419	ldd		-0x18(%r30), p224d1
420	xmpyu		%fr8L, %fr6L, %fr25
421	fstd		%fr26, -0x58(%r30)	C mid product to  -0x58..-0x51
422	xmpyu		%fr8R, %fr7R, %fr26
423	ldd		-0x10(%r30), p224d2
424	fstd		%fr27, -0x50(%r30)	C mid product to  -0x50..-0x49
C Exactly two groups: skip the loop.  The xmpyu is in the delay slot.
425	addib,=		-1, n, L(end2)
426	xmpyu		%fr8L, %fr7L, %fr27
C Main loop, one group of four limbs per pass, with three groups in flight:
C  - the integer unit finishes the group whose mid products were reloaded on
C    the previous pass and stores its four result limbs;
C  - the products of the following group, still in FP registers, are spilled
C    into the same stack slots, each slot being reloaded before it is
C    overwritten;
C  - the group after that is loaded and multiplied at the bottom.
C The statement order carries the add / add,dc carry chains and the slot
C reload-before-store pairing, so instructions must not be reordered.
427LDEF(loop)
C Sum each mid product pair, chaining the carry through to m288.
428	add		p032a1, p032a2, m032
429	ldd		-0x80(%r30), p000a
430	add,dc		p096b1, p096b2, m096
431	fstd		%fr28, -0x18(%r30)	C mid product to  -0x18..-0x11
432
433	add,dc		p160c1, p160c2, m160
434	ldd		-0x68(%r30), p064a
435	add,dc		p224d1, p224d2, m224
436	fstd		%fr29, -0x10(%r30)	C mid product to  -0x10..-0x09
437
438	add,dc		%r0, %r0, m288
439	ldd		-0x40(%r30), p064b
440	ldo		32(up), up
441	fstd		%fr30, -0x80(%r30)	C low product to  -0x80..-0x79
442
C Realign the mid sums by 32 bits into ma000..ma256.
443	depd,z		m032, 31, 32, ma000
444	ldd		-0x28(%r30), p128b
445	extrd,u		m032, 31, 32, ma064
446	fstd		%fr31, -0x68(%r30)	C high product to -0x68..-0x61
447
448	depd		m096, 31, 32, ma064
449	ldd		-0x60(%r30), p128c
450	extrd,u		m096, 31, 32, ma128
451	fstd		%fr22, -0x40(%r30)	C low product to  -0x40..-0x39
452
453	depd		m160, 31, 32, ma128
454	ldd		-0x48(%r30), p192c
455	extrd,u		m160, 31, 32, ma192
456	fstd		%fr23, -0x28(%r30)	C high product to -0x28..-0x21
457
458	depd		m224, 31, 32, ma192
459	ldd		-0x20(%r30), p192d
460	extrd,u		m224, 31, 32, ma256
461	fstd		%fr24, -0x60(%r30)	C low product to  -0x60..-0x59
462
C Add the incoming carry and the low/high products into s000..s192.
463	depd		m288, 31, 32, ma256
464	ldd		-0x88(%r30), p256d
465	add		climb, p000a, s000
466	fstd		%fr25, -0x48(%r30)	C high product to -0x48..-0x41
467
468	add,dc		p064a, p064b, s064
469	add,dc		p128b, p128c, s128
470	fstd		%fr26, -0x20(%r30)	C low product to  -0x20..-0x19
471
472	add,dc		p192c, p192d, s192
473	add,dc		p256d, %r0, climb
474	fstd		%fr27, -0x88(%r30)	C high product to -0x88..-0x81
475
C Add the realigned mid sums, store the result limbs and start loading the
C next group of source limbs.
476	add		ma000, s000, s000	C accum mid 0
477	fldd		0(up), %fr4
478	add,dc		ma064, s064, s064	C accum mid 1
479	std		s000, 0(rp)
480
481	add,dc		ma128, s128, s128	C accum mid 2
482	fldd		8(up), %fr5
483	add,dc		ma192, s192, s192	C accum mid 3
484	std		s064, 8(rp)
485
486	add,dc		ma256, climb, climb
487	fldd		16(up), %fr6
488	std		s128, 16(rp)
489
C Multiply the newly loaded group while reloading the mid products that were
C spilled on the previous pass.
490	xmpyu		%fr8R, %fr4L, %fr22
491	ldd		-0x78(%r30), p032a1
492	xmpyu		%fr8L, %fr4R, %fr23
493	fldd		24(up), %fr7
494
495	xmpyu		%fr8R, %fr5L, %fr24
496	ldd		-0x70(%r30), p032a2
497	xmpyu		%fr8L, %fr5R, %fr25
498	std		s192, 24(rp)
499
500	xmpyu		%fr8R, %fr6L, %fr26
501	ldd		-0x38(%r30), p096b1
502	xmpyu		%fr8L, %fr6R, %fr27
503	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
504
505	xmpyu		%fr8R, %fr7L, %fr28
506	ldd		-0x30(%r30), p096b2
507	xmpyu		%fr8L, %fr7R, %fr29
508	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
509
510	xmpyu		%fr8R, %fr4R, %fr30
511	ldd		-0x58(%r30), p160c1
512	xmpyu		%fr8L, %fr4L, %fr31
513	fstd		%fr24, -0x38(%r30)	C mid product to  -0x38..-0x31
514
515	xmpyu		%fr8R, %fr5R, %fr22
516	ldd		-0x50(%r30), p160c2
517	xmpyu		%fr8L, %fr5L, %fr23
518	fstd		%fr25, -0x30(%r30)	C mid product to  -0x30..-0x29
519
520	xmpyu		%fr8R, %fr6R, %fr24
521	ldd		-0x18(%r30), p224d1
522	xmpyu		%fr8L, %fr6L, %fr25
523	fstd		%fr26, -0x58(%r30)	C mid product to  -0x58..-0x51
524
525	xmpyu		%fr8R, %fr7R, %fr26
526	ldd		-0x10(%r30), p224d2
527	fstd		%fr27, -0x50(%r30)	C mid product to  -0x50..-0x49
528	xmpyu		%fr8L, %fr7L, %fr27
529
C Loop while groups remain; the rp update is in the delay slot.
530	addib,<>	-1, n, L(loop)
531	ldo		32(rp), rp
532
C Wind-down with two groups outstanding and no more limbs to load.  Finishes
C and stores the older group while spilling the last group's products, then
C reloads the last group's mid products and falls through to L(end1).
533LDEF(end2)
534	add		p032a1, p032a2, m032
535	ldd		-0x80(%r30), p000a
536	add,dc		p096b1, p096b2, m096
537	fstd		%fr28, -0x18(%r30)	C mid product to  -0x18..-0x11
538	add,dc		p160c1, p160c2, m160
539	ldd		-0x68(%r30), p064a
540	add,dc		p224d1, p224d2, m224
541	fstd		%fr29, -0x10(%r30)	C mid product to  -0x10..-0x09
542	add,dc		%r0, %r0, m288
543	ldd		-0x40(%r30), p064b
544	fstd		%fr30, -0x80(%r30)	C low product to  -0x80..-0x79
545	depd,z		m032, 31, 32, ma000
546	ldd		-0x28(%r30), p128b
547	extrd,u		m032, 31, 32, ma064
548	fstd		%fr31, -0x68(%r30)	C high product to -0x68..-0x61
549	depd		m096, 31, 32, ma064
550	ldd		-0x60(%r30), p128c
551	extrd,u		m096, 31, 32, ma128
552	fstd		%fr22, -0x40(%r30)	C low product to  -0x40..-0x39
553	depd		m160, 31, 32, ma128
554	ldd		-0x48(%r30), p192c
555	extrd,u		m160, 31, 32, ma192
556	fstd		%fr23, -0x28(%r30)	C high product to -0x28..-0x21
557	depd		m224, 31, 32, ma192
558	ldd		-0x20(%r30), p192d
559	extrd,u		m224, 31, 32, ma256
560	fstd		%fr24, -0x60(%r30)	C low product to  -0x60..-0x59
561	depd		m288, 31, 32, ma256
562	ldd		-0x88(%r30), p256d
563	add		climb, p000a, s000
564	fstd		%fr25, -0x48(%r30)	C high product to -0x48..-0x41
565	add,dc		p064a, p064b, s064
566	add,dc		p128b, p128c, s128
567	fstd		%fr26, -0x20(%r30)	C low product to  -0x20..-0x19
568	add,dc		p192c, p192d, s192
569	add,dc		p256d, %r0, climb
570	fstd		%fr27, -0x88(%r30)	C high product to -0x88..-0x81
571	add		ma000, s000, s000	C accum mid 0
572	add,dc		ma064, s064, s064	C accum mid 1
573	add,dc		ma128, s128, s128	C accum mid 2
574	add,dc		ma192, s192, s192	C accum mid 3
575	add,dc		ma256, climb, climb
C Store the four result limbs and reload the last group's mid products.
576	std		s000, 0(rp)
577	std		s064, 8(rp)
578	ldd		-0x78(%r30), p032a1
579	std		s128, 16(rp)
580	ldd		-0x70(%r30), p032a2
581	std		s192, 24(rp)
582	ldd		-0x38(%r30), p096b1
583	ldd		-0x30(%r30), p096b2
584	ldd		-0x58(%r30), p160c1
585	ldd		-0x50(%r30), p160c2
586	ldd		-0x18(%r30), p224d1
587	ldd		-0x10(%r30), p224d2
588	ldo		32(rp), rp
589
C Wind-down for the last group.  Its mid products are already in registers
C and its low/high products are in the stack slots.  Produces the final four
C result limbs, leaves the outgoing carry in climb, then restores r6-r13.
590LDEF(end1)
591	add		p032a1, p032a2, m032
592	ldd		-0x80(%r30), p000a
593	add,dc		p096b1, p096b2, m096
594	add,dc		p160c1, p160c2, m160
595	ldd		-0x68(%r30), p064a
596	add,dc		p224d1, p224d2, m224
597	add,dc		%r0, %r0, m288
598	ldd		-0x40(%r30), p064b
599	depd,z		m032, 31, 32, ma000
600	ldd		-0x28(%r30), p128b
601	extrd,u		m032, 31, 32, ma064
602	depd		m096, 31, 32, ma064
603	ldd		-0x60(%r30), p128c
604	extrd,u		m096, 31, 32, ma128
605	depd		m160, 31, 32, ma128
606	ldd		-0x48(%r30), p192c
607	extrd,u		m160, 31, 32, ma192
608	depd		m224, 31, 32, ma192
609	ldd		-0x20(%r30), p192d
610	extrd,u		m224, 31, 32, ma256
611	depd		m288, 31, 32, ma256
612	ldd		-0x88(%r30), p256d
613	add		climb, p000a, s000
614	add,dc		p064a, p064b, s064
615	add,dc		p128b, p128c, s128
616	add,dc		p192c, p192d, s192
617	add,dc		p256d, %r0, climb
618	add		ma000, s000, s000	C accum mid 0
619	add,dc		ma064, s064, s064	C accum mid 1
620	add,dc		ma128, s128, s128	C accum mid 2
621	add,dc		ma192, s192, s192	C accum mid 3
622	add,dc		ma256, climb, climb
623	std		s000, 0(rp)
624	std		s064, 8(rp)
625	std		s128, 16(rp)
626	std		s192, 24(rp)
627
C Restore the registers saved at L(BIG).
628	ldd		-0xb0(%r30), %r13
629	ldd		-0xb8(%r30), %r12
630	ldd		-0xc0(%r30), %r11
631	ldd		-0xc8(%r30), %r10
632	ldd		-0xd0(%r30), %r9
633	ldd		-0xd8(%r30), %r8
634	ldd		-0xe0(%r30), %r7
635	ldd		-0xe8(%r30), %r6
C Common exit.  Returns the carry limb: copied to %r28 for 2.0w, otherwise
C split into its upper 32 bits in %r28 and lower 32 bits in %r29.  Restores
C r5 and r4, then returns through %r2.  The ldd,mb in the delay slot of the
C bve reloads r3 and steps r30 back by 0x100.
636LDEF(done)
637ifdef(`HAVE_ABI_2_0w',
638`	copy		climb, %r28
639',`	extrd,u		climb, 63, 32, %r29
640	extrd,u		climb, 31, 32, %r28
641')
642	ldd		-0xf0(%r30), %r5
643	ldd		-0xf8(%r30), %r4
644	bve		(%r2)
645	ldd,mb		-0x100(%r30), %r3
646EPILOGUE(mpn_mul_1)
647