xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/pa64/submul_1.asm (revision c38e7cc395b1472a774ff828e46123de44c628e9)
1dnl  HP-PA 2.0 64-bit mpn_submul_1 -- Multiply a limb vector with a limb and
2dnl  subtract the result from a second limb vector.
3
4dnl  Copyright 1998-2000, 2002, 2003 Free Software Foundation, Inc.
5
6dnl  This file is part of the GNU MP Library.
7dnl
8dnl  The GNU MP Library is free software; you can redistribute it and/or modify
9dnl  it under the terms of either:
10dnl
11dnl    * the GNU Lesser General Public License as published by the Free
12dnl      Software Foundation; either version 3 of the License, or (at your
13dnl      option) any later version.
14dnl
15dnl  or
16dnl
17dnl    * the GNU General Public License as published by the Free Software
18dnl      Foundation; either version 2 of the License, or (at your option) any
19dnl      later version.
20dnl
21dnl  or both in parallel, as here.
22dnl
23dnl  The GNU MP Library is distributed in the hope that it will be useful, but
24dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
25dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
26dnl  for more details.
27dnl
28dnl  You should have received copies of the GNU General Public License and the
29dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
30dnl  see https://www.gnu.org/licenses/.
31
32include(`../config.m4')
33
34C		    cycles/limb
35C 8000,8200:		7
36C 8500,8600,8700:	6.5
37
38C  The feed-in and wind-down code has not yet been scheduled.  Many cycles
39C  could be saved there per call.
40
41C  DESCRIPTION:
42C  The main loop "BIG" is 4-way unrolled, mainly to allow
43C  effective use of ADD,DC.  Delays in moving data via the cache from the FP
44C  registers to the IU registers, have demanded a deep software pipeline, and
45C  a lot of stack slots for partial products in flight.
46C
47C  CODE STRUCTURE:
48C  save-some-registers
49C  do 0, 1, 2, or 3 limbs
50C  if done, restore-some-regs and return
51C  save-many-regs
52C  do 4, 8, ... limbs
53C  restore-all-regs
54
55C  STACK LAYOUT:
56C  HP-PA stack grows upwards.  We could allocate 8 fewer slots by using the
57C  slots marked FREE, as well as some slots in the caller's "frame marker".
58C
59C -00 <- r30
60C -08  FREE
61C -10  tmp
62C -18  tmp
63C -20  tmp
64C -28  tmp
65C -30  tmp
66C -38  tmp
67C -40  tmp
68C -48  tmp
69C -50  tmp
70C -58  tmp
71C -60  tmp
72C -68  tmp
73C -70  tmp
74C -78  tmp
75C -80  tmp
76C -88  tmp
77C -90  FREE
78C -98  FREE
79C -a0  FREE
80C -a8  FREE
81C -b0  r13
82C -b8  r12
83C -c0  r11
84C -c8  r10
85C -d0  r9
86C -d8  r8
87C -e0  r7
88C -e8  r6
89C -f0  r5
90C -f8  r4
91C -100 r3
92C  Previous frame:
93C  [unused area]
94C -38/-138 vlimb home slot.  For 2.0N, the vlimb arg will arrive here.
95
96
97include(`../config.m4')
98
99C INPUT PARAMETERS:
C Names for the argument registers used throughout the routine.
100define(`rp',`%r26')	C limb vector that is loaded, reduced by the product, and stored back
101define(`up',`%r25')	C limb vector whose limbs are multiplied by vlimb
102define(`n',`%r24')	C limb count (low two bits pick the short path, n>>2 counts groups of four)
103define(`vlimb',`%r23')	C the single multiplier limb
104
C climb is the running carry/borrow limb that becomes the return value.  It
C shares %r23 with vlimb, so vlimb has to be consumed before climb is cleared.
105define(`climb',`%r23')	C
106
C Pick the assembler level depending on whether HAVE_ABI_2_0w is defined.
107ifdef(`HAVE_ABI_2_0w',
108`	.level	2.0w
109',`	.level	2.0
110')
C Entry point.  Saves the registers needed by the short (n mod 4) code,
C moves vlimb into %fr8 through its stack home slot, clears the carry limb,
C names the registers used by the short code, and dispatches on n mod 4.
111PROLOGUE(mpn_submul_1)
112
113ifdef(`HAVE_ABI_2_0w',
114`	std		vlimb, -0x38(%r30)	C store vlimb into "home" slot
115')
116	std,ma		%r3, 0x100(%r30)	C save r3, then advance r30 by 0x100 (r3 ends up at -0x100)
117	std		%r4, -0xf8(%r30)
118	std		%r5, -0xf0(%r30)
119	ldo		0(%r0), climb		C clear climb
120	fldd		-0x138(%r30), %fr8	C put vlimb in fp register
121
C Register names for the 1..3 limb code.  Note that m032 and ma064 are both
C %r20, and that BIG redefines all of these names later on.
122define(`p032a1',`%r1')	C first 32x32 mid (cross) product
123define(`p032a2',`%r19')	C second 32x32 mid (cross) product
124
125define(`m032',`%r20')	C sum of the two mid products
126define(`m096',`%r21')	C carry out of that sum
127
128define(`p000a',`%r22')	C low 32x32 product
129define(`p064a',`%r29')	C high 32x32 product
130
131define(`s000',`%r31')	C result limb being assembled
132
133define(`ma000',`%r4')	C low 32 bits of m032 moved to the upper half
134define(`ma064',`%r20')	C upper 32 bits of m032, with m096 deposited above them
135
136define(`r000',`%r3')	C limb loaded from rp
137
138	extrd,u		n, 63, 2, %r5	C r5 = low two bits of n, i.e. n mod 4
139	cmpb,=		%r5, %r0, L(BIG)	C nothing left over: go straight to the 4-way code
140	nop				C branch delay slot
141
C First leftover limb.  Form the four 32x32 partial products of vlimb (%fr8)
C and the limb at up, and park them in stack slots so they can be reloaded
C into integer registers.
142	fldd		0(up), %fr4
143	ldo		8(up), up
144	xmpyu		%fr8R, %fr4L, %fr22
145	xmpyu		%fr8L, %fr4R, %fr23
146	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
147	xmpyu		%fr8R, %fr4R, %fr24
148	xmpyu		%fr8L, %fr4L, %fr25
149	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
150	fstd		%fr24, -0x80(%r30)	C low product to  -0x80..-0x79
C Decrement the leftover count in r5 and branch if more limbs remain.  The
C fstd after the branch sits in its delay slot.
151	addib,<>	-1, %r5, L(two_or_more)
152	fstd		%fr25, -0x68(%r30)	C high product to -0x68..-0x61
C Exactly one leftover limb: reload its products and join the common tail.
153LDEF(one)
154	ldd		-0x78(%r30), p032a1
155	ldd		-0x70(%r30), p032a2
156	ldd		-0x80(%r30), p000a
157	b		L(0_one_out)
158	ldd		-0x68(%r30), p064a	C delay slot of the branch above
159
C Second leftover limb.  Each stack slot is read (ldd, products of the first
C limb) just before it is overwritten (fstd, products of the second limb),
C so the same four slots serve both limbs.
160LDEF(two_or_more)
161	fldd		0(up), %fr4
162	ldo		8(up), up
163	xmpyu		%fr8R, %fr4L, %fr22
164	xmpyu		%fr8L, %fr4R, %fr23
165	ldd		-0x78(%r30), p032a1
166	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
167	xmpyu		%fr8R, %fr4R, %fr24
168	xmpyu		%fr8L, %fr4L, %fr25
169	ldd		-0x70(%r30), p032a2
170	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
171	ldd		-0x80(%r30), p000a
172	fstd		%fr24, -0x80(%r30)	C low product to  -0x80..-0x79
173	ldd		-0x68(%r30), p064a
C Decrement r5 again; a nonzero result means a third leftover limb.
174	addib,<>	-1, %r5, L(three_or_more)
175	fstd		%fr25, -0x68(%r30)	C high product to -0x68..-0x61
C Exactly two leftover limbs: combine the mid products of the first limb,
C fetch the first rp limb, and join the tail at 0_two_out.
176LDEF(two)
177	add		p032a1, p032a2, m032
178	add,dc		%r0, %r0, m096	C m096 = carry out of the add above
179	depd,z		m032, 31, 32, ma000	C ma000 = low half of m032 in the upper 32 bits
180	extrd,u		m032, 31, 32, ma064	C ma064 = upper half of m032
181	ldd		0(rp), r000
182	b		L(0_two_out)
183	depd		m096, 31, 32, ma064	C delay slot: put m096 above it
184
C Three leftover limbs.  On entry the integer registers hold the partial
C products of the first limb and the stack slots hold those of the second.
C Load the third limb and combine the mid products of the first.
185LDEF(three_or_more)
186	fldd		0(up), %fr4
187	add		p032a1, p032a2, m032
188	add,dc		%r0, %r0, m096
189	depd,z		m032, 31, 32, ma000
190	extrd,u		m032, 31, 32, ma064
191	ldd		0(rp), r000
192C	addib,=		-1, %r5, L(0_out)
193	depd		m096, 31, 32, ma064
C Every instruction of the loop0 body below is commented out, so execution
C falls straight through from three_or_more into 0_out.
194LDEF(loop0)
195C	xmpyu		%fr8R, %fr4L, %fr22
196C	xmpyu		%fr8L, %fr4R, %fr23
197C	ldd		-0x78(%r30), p032a1
198C	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
199C
200C	xmpyu		%fr8R, %fr4R, %fr24
201C	xmpyu		%fr8L, %fr4L, %fr25
202C	ldd		-0x70(%r30), p032a2
203C	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
204C
205C	ldo		8(rp), rp
206C	add		climb, p000a, s000
207C	ldd		-0x80(%r30), p000a
208C	fstd		%fr24, -0x80(%r30)	C low product to  -0x80..-0x79
209C
210C	add,dc		p064a, %r0, climb
211C	ldo		8(up), up
212C	ldd		-0x68(%r30), p064a
213C	fstd		%fr25, -0x68(%r30)	C high product to -0x68..-0x61
214C
215C	add		ma000, s000, s000
216C	add,dc		ma064, climb, climb
217C	fldd		0(up), %fr4
218C
219C	sub		r000, s000, s000
220C	sub,db		%r0, climb, climb
221C	sub		%r0, climb, climb
222C	std		s000, -8(rp)
223C
224C	add		p032a1, p032a2, m032
225C	add,dc		%r0, %r0, m096
226C
227C	depd,z		m032, 31, 32, ma000
228C	extrd,u		m032, 31, 32, ma064
229C	ldd		0(rp), r000
230C	addib,<>	-1, %r5, L(loop0)
231C	depd		m096, 31, 32, ma064
C Multiply the third limb while finishing the first one: the products of the
C second limb are reloaded from the stack slots just before the products of
C the third limb overwrite them.
232LDEF(0_out)
233	ldo		8(up), up
234	xmpyu		%fr8R, %fr4L, %fr22
235	xmpyu		%fr8L, %fr4R, %fr23
236	ldd		-0x78(%r30), p032a1
237	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
238	xmpyu		%fr8R, %fr4R, %fr24
239	xmpyu		%fr8L, %fr4L, %fr25
240	ldd		-0x70(%r30), p032a2
241	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
242	ldo		8(rp), rp
243	add		climb, p000a, s000	C low product plus incoming carry limb
244	ldd		-0x80(%r30), p000a
245	fstd		%fr24, -0x80(%r30)	C low product to  -0x80..-0x79
246	add,dc		p064a, %r0, climb	C new carry limb = high product + carry
247	ldd		-0x68(%r30), p064a
248	fstd		%fr25, -0x68(%r30)	C high product to -0x68..-0x61
249	add		ma000, s000, s000	C add in the shifted mid-product sum
250	add,dc		ma064, climb, climb
251	sub		r000, s000, s000	C s000 = rp limb - product limb
C The next two instructions fold the borrow of that subtraction into climb:
C climb = 0 - climb - borrow, then climb = 0 - climb.
252	sub,db		%r0, climb, climb
253	sub		%r0, climb, climb
254	std		s000, -8(rp)	C rp was already advanced, so this is the limb just read
C Combine the mid products of the second limb and fetch the next rp limb.
255	add		p032a1, p032a2, m032
256	add,dc		%r0, %r0, m096
257	depd,z		m032, 31, 32, ma000
258	extrd,u		m032, 31, 32, ma064
259	ldd		0(rp), r000
260	depd		m096, 31, 32, ma064
C Tail of the short code with two limbs still unfinished.  Reload the last
C limb's partial products from the stack and complete the limb whose
C mid-product sum is already in ma000/ma064.
261LDEF(0_two_out)
262	ldd		-0x78(%r30), p032a1
263	ldd		-0x70(%r30), p032a2
264	ldo		8(rp), rp
265	add		climb, p000a, s000
266	ldd		-0x80(%r30), p000a
267	add,dc		p064a, %r0, climb
268	ldd		-0x68(%r30), p064a
269	add		ma000, s000, s000
270	add,dc		ma064, climb, climb
271	sub		r000, s000, s000	C s000 = rp limb - product limb
272	sub,db		%r0, climb, climb	C climb = -(climb + borrow)
273	sub		%r0, climb, climb	C negate back: climb now includes the borrow
274	std		s000, -8(rp)
C Tail with one limb unfinished: its four partial products are in
C p032a1, p032a2, p000a and p064a.
275LDEF(0_one_out)
276	add		p032a1, p032a2, m032
277	add,dc		%r0, %r0, m096
278	depd,z		m032, 31, 32, ma000
279	extrd,u		m032, 31, 32, ma064
280	ldd		0(rp), r000
281	depd		m096, 31, 32, ma064
282
283	add		climb, p000a, s000
284	add,dc		p064a, %r0, climb
285	add		ma000, s000, s000
286	add,dc		ma064, climb, climb
287	sub		r000, s000, s000
288	sub,db		%r0, climb, climb
289	sub		%r0, climb, climb
290	std		s000, 0(rp)
291
C Branch to done when 4 >= n, i.e. no full group of four limbs remains;
C otherwise fall into BIG.  rp is advanced in the delay slot either way.
292	cmpib,>=	4, n, L(done)
293	ldo		8(rp), rp
294
295C 4-way unrolled code.
296
C Set-up for the 4-way unrolled code: rename the working registers, save
C r6..r13 in the frame, and turn n into a count of four-limb groups.
296LDEF(BIG)
297
C Mid (cross) products, two per limb; the letters a..d name the four limbs
C of a group.
299define(`p032a1',`%r1')	C
300define(`p032a2',`%r19')	C
301define(`p096b1',`%r20')	C
302define(`p096b2',`%r21')	C
303define(`p160c1',`%r22')	C
304define(`p160c2',`%r29')	C
305define(`p224d1',`%r31')	C
306define(`p224d2',`%r3')	C
307			C
C Sums of the mid-product pairs; m288 receives the final carry of the chain.
308define(`m032',`%r4')	C
309define(`m096',`%r5')	C
310define(`m160',`%r6')	C
311define(`m224',`%r7')	C
312define(`m288',`%r8')	C
313			C
C Low and high products.  These reuse the registers of the mid products
C above, which have been summed into m032..m224 by the time they are loaded.
314define(`p000a',`%r1')	C
315define(`p064a',`%r19')	C
316define(`p064b',`%r20')	C
317define(`p128b',`%r21')	C
318define(`p128c',`%r22')	C
319define(`p192c',`%r29')	C
320define(`p192d',`%r31')	C
321define(`p256d',`%r3')	C
322			C
C Result limbs being assembled.
323define(`s000',`%r10')	C
324define(`s064',`%r11')	C
325define(`s128',`%r12')	C
326define(`s192',`%r13')	C
327			C
C Mid-product sums realigned by 32 bits; ma064..ma256 overwrite m032..m224.
328define(`ma000',`%r9')	C
329define(`ma064',`%r4')	C
330define(`ma128',`%r5')	C
331define(`ma192',`%r6')	C
332define(`ma256',`%r7')	C
333			C
C Limbs loaded from rp; these reuse the registers of p000a..p128b.
334define(`r000',`%r1')	C
335define(`r064',`%r19')	C
336define(`r128',`%r20')	C
337define(`r192',`%r21')	C
338
C Save r6..r13; the exit path reloads them from these same slots.
339	std		%r6, -0xe8(%r30)
340	std		%r7, -0xe0(%r30)
341	std		%r8, -0xd8(%r30)
342	std		%r9, -0xd0(%r30)
343	std		%r10, -0xc8(%r30)
344	std		%r11, -0xc0(%r30)
345	std		%r12, -0xb8(%r30)
346	std		%r13, -0xb0(%r30)
347
C n = n >> 2, the number of four-limb groups still to do.
348ifdef(`HAVE_ABI_2_0w',
349`	extrd,u		n, 61, 62, n		C right shift 2
350',`	extrd,u		n, 61, 30, n		C right shift 2, zero extend
351')
352
C First group of four limbs.  Load them, form all sixteen 32x32 partial
C products, and start spilling the mid products to the stack.  fr22..fr27
C are reused for low/high products once their mid products are stored or
C queued for storing.
353LDEF(4_or_more)
354	fldd		0(up), %fr4
355	fldd		8(up), %fr5
356	fldd		16(up), %fr6
357	fldd		24(up), %fr7
358	xmpyu		%fr8R, %fr4L, %fr22
359	xmpyu		%fr8L, %fr4R, %fr23
360	xmpyu		%fr8R, %fr5L, %fr24
361	xmpyu		%fr8L, %fr5R, %fr25
362	xmpyu		%fr8R, %fr6L, %fr26
363	xmpyu		%fr8L, %fr6R, %fr27
364	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
365	xmpyu		%fr8R, %fr7L, %fr28
366	xmpyu		%fr8L, %fr7R, %fr29
367	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
368	xmpyu		%fr8R, %fr4R, %fr30
369	xmpyu		%fr8L, %fr4L, %fr31
370	fstd		%fr24, -0x38(%r30)	C mid product to  -0x38..-0x31
371	xmpyu		%fr8R, %fr5R, %fr22
372	xmpyu		%fr8L, %fr5L, %fr23
373	fstd		%fr25, -0x30(%r30)	C mid product to  -0x30..-0x29
374	xmpyu		%fr8R, %fr6R, %fr24
375	xmpyu		%fr8L, %fr6L, %fr25
376	fstd		%fr26, -0x58(%r30)	C mid product to  -0x58..-0x51
377	xmpyu		%fr8R, %fr7R, %fr26
378	fstd		%fr27, -0x50(%r30)	C mid product to  -0x50..-0x49
C Decrement the group count; branch if further groups follow.  The xmpyu
C after the branch is in its delay slot.
379	addib,<>	-1, n, L(8_or_more)
380	xmpyu		%fr8L, %fr7L, %fr27
C This was the only group: spill the remaining products, reload the mid
C products into integer registers, and finish in end1.
381	fstd		%fr28, -0x18(%r30)	C mid product to  -0x18..-0x11
382	fstd		%fr29, -0x10(%r30)	C mid product to  -0x10..-0x09
383	fstd		%fr30, -0x80(%r30)	C low product to  -0x80..-0x79
384	fstd		%fr31, -0x68(%r30)	C high product to -0x68..-0x61
385	fstd		%fr22, -0x40(%r30)	C low product to  -0x40..-0x39
386	fstd		%fr23, -0x28(%r30)	C high product to -0x28..-0x21
387	fstd		%fr24, -0x60(%r30)	C low product to  -0x60..-0x59
388	fstd		%fr25, -0x48(%r30)	C high product to -0x48..-0x41
389	fstd		%fr26, -0x20(%r30)	C low product to  -0x20..-0x19
390	fstd		%fr27, -0x88(%r30)	C high product to -0x88..-0x81
391	ldd		-0x78(%r30), p032a1
392	ldd		-0x70(%r30), p032a2
393	ldd		-0x38(%r30), p096b1
394	ldd		-0x30(%r30), p096b2
395	ldd		-0x58(%r30), p160c1
396	ldd		-0x50(%r30), p160c2
397	ldd		-0x18(%r30), p224d1
398	ldd		-0x10(%r30), p224d2
399	b		L(end1)
400	nop				C branch delay slot
401
C At least two groups.  Spill the rest of the first group's products, then
C load and multiply the second group while the first group's mid products
C are reloaded into integer registers.  Each ldd of a slot comes before the
C fstd that overwrites it with the second group's value.
402LDEF(8_or_more)
403	fstd		%fr28, -0x18(%r30)	C mid product to  -0x18..-0x11
404	fstd		%fr29, -0x10(%r30)	C mid product to  -0x10..-0x09
405	ldo		32(up), up
406	fstd		%fr30, -0x80(%r30)	C low product to  -0x80..-0x79
407	fstd		%fr31, -0x68(%r30)	C high product to -0x68..-0x61
408	fstd		%fr22, -0x40(%r30)	C low product to  -0x40..-0x39
409	fstd		%fr23, -0x28(%r30)	C high product to -0x28..-0x21
410	fstd		%fr24, -0x60(%r30)	C low product to  -0x60..-0x59
411	fstd		%fr25, -0x48(%r30)	C high product to -0x48..-0x41
412	fstd		%fr26, -0x20(%r30)	C low product to  -0x20..-0x19
413	fstd		%fr27, -0x88(%r30)	C high product to -0x88..-0x81
414	fldd		0(up), %fr4
415	fldd		8(up), %fr5
416	fldd		16(up), %fr6
417	fldd		24(up), %fr7
418	xmpyu		%fr8R, %fr4L, %fr22
419	ldd		-0x78(%r30), p032a1
420	xmpyu		%fr8L, %fr4R, %fr23
421	xmpyu		%fr8R, %fr5L, %fr24
422	ldd		-0x70(%r30), p032a2
423	xmpyu		%fr8L, %fr5R, %fr25
424	xmpyu		%fr8R, %fr6L, %fr26
425	ldd		-0x38(%r30), p096b1
426	xmpyu		%fr8L, %fr6R, %fr27
427	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
428	xmpyu		%fr8R, %fr7L, %fr28
429	ldd		-0x30(%r30), p096b2
430	xmpyu		%fr8L, %fr7R, %fr29
431	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
432	xmpyu		%fr8R, %fr4R, %fr30
433	ldd		-0x58(%r30), p160c1
434	xmpyu		%fr8L, %fr4L, %fr31
435	fstd		%fr24, -0x38(%r30)	C mid product to  -0x38..-0x31
436	xmpyu		%fr8R, %fr5R, %fr22
437	ldd		-0x50(%r30), p160c2
438	xmpyu		%fr8L, %fr5L, %fr23
439	fstd		%fr25, -0x30(%r30)	C mid product to  -0x30..-0x29
440	xmpyu		%fr8R, %fr6R, %fr24
441	ldd		-0x18(%r30), p224d1
442	xmpyu		%fr8L, %fr6L, %fr25
443	fstd		%fr26, -0x58(%r30)	C mid product to  -0x58..-0x51
444	xmpyu		%fr8R, %fr7R, %fr26
445	ldd		-0x10(%r30), p224d2
446	fstd		%fr27, -0x50(%r30)	C mid product to  -0x50..-0x49
C Decrement the group count: zero means exactly two groups, so skip the loop
C and wind down in end2; otherwise fall into loop.  The xmpyu is in the
C delay slot.
447	addib,=		-1, n, L(end2)
448	xmpyu		%fr8L, %fr7L, %fr27
C Main 4-way unrolled loop.  Three groups of four limbs are in flight:
C   - the oldest group is completed and stored to rp,
C   - the next group's products move from FP registers to the stack, and
C     its mid products are then reloaded into integer registers,
C   - a new group is loaded from up and multiplied.
C Stack slots are shared between groups, so every ldd of a slot comes
C before the fstd that overwrites it.
449LDEF(loop)
C Sum the mid-product pairs as one carry chain; m288 catches the last carry.
450	add		p032a1, p032a2, m032
451	ldd		-0x80(%r30), p000a
452	add,dc		p096b1, p096b2, m096
453	fstd		%fr28, -0x18(%r30)	C mid product to  -0x18..-0x11
454
455	add,dc		p160c1, p160c2, m160
456	ldd		-0x68(%r30), p064a
457	add,dc		p224d1, p224d2, m224
458	fstd		%fr29, -0x10(%r30)	C mid product to  -0x10..-0x09
459
460	add,dc		%r0, %r0, m288
461	ldd		-0x40(%r30), p064b
462	ldo		32(up), up
463	fstd		%fr30, -0x80(%r30)	C low product to  -0x80..-0x79
464
C Realign the mid sums by 32 bits: each maNNN takes the upper half of one sum
C in its low 32 bits and the low half of the next sum in its high 32 bits.
465	depd,z		m032, 31, 32, ma000
466	ldd		-0x28(%r30), p128b
467	extrd,u		m032, 31, 32, ma064
468	fstd		%fr31, -0x68(%r30)	C high product to -0x68..-0x61
469
470	depd		m096, 31, 32, ma064
471	ldd		-0x60(%r30), p128c
472	extrd,u		m096, 31, 32, ma128
473	fstd		%fr22, -0x40(%r30)	C low product to  -0x40..-0x39
474
475	depd		m160, 31, 32, ma128
476	ldd		-0x48(%r30), p192c
477	extrd,u		m160, 31, 32, ma192
478	fstd		%fr23, -0x28(%r30)	C high product to -0x28..-0x21
479
480	depd		m224, 31, 32, ma192
481	ldd		-0x20(%r30), p192d
482	extrd,u		m224, 31, 32, ma256
483	fstd		%fr24, -0x60(%r30)	C low product to  -0x60..-0x59
484
485	depd		m288, 31, 32, ma256
486	ldd		-0x88(%r30), p256d
C Add the incoming carry limb and the overlapping low/high products.
487	add		climb, p000a, s000
488	fstd		%fr25, -0x48(%r30)	C high product to -0x48..-0x41
489
490	add,dc		p064a, p064b, s064
491	ldd		0(rp), r000
492	add,dc		p128b, p128c, s128
493	fstd		%fr26, -0x20(%r30)	C low product to  -0x20..-0x19
494
495	add,dc		p192c, p192d, s192
496	ldd		8(rp), r064
497	add,dc		p256d, %r0, climb
498	fstd		%fr27, -0x88(%r30)	C high product to -0x88..-0x81
499
500	ldd		16(rp), r128
501	add		ma000, s000, s000	C accum mid 0
502	ldd		24(rp), r192
503	add,dc		ma064, s064, s064	C accum mid 1
504
505	add,dc		ma128, s128, s128	C accum mid 2
506	fldd		0(up), %fr4
507	add,dc		ma192, s192, s192	C accum mid 3
508	fldd		8(up), %fr5
509
510	add,dc		ma256, climb, climb
511	fldd		16(up), %fr6
512	sub		r000, s000, s000	C accum rlimb 0
513	fldd		24(up), %fr7
514
515	sub,db		r064, s064, s064	C accum rlimb 1
516	sub,db		r128, s128, s128	C accum rlimb 2
517	std		s000, 0(rp)
518
519	sub,db		r192, s192, s192	C accum rlimb 3
C Fold the final borrow into the carry limb:
C climb = 0 - climb - borrow, then climb = 0 - climb.
520	sub,db		%r0, climb, climb
521	sub		%r0, climb, climb
522	std		s064, 8(rp)
523
C Multiply the newly loaded group while reloading the mid products of the
C group that the next pass will complete.
524	xmpyu		%fr8R, %fr4L, %fr22
525	ldd		-0x78(%r30), p032a1
526	xmpyu		%fr8L, %fr4R, %fr23
527	std		s128, 16(rp)
528
529	xmpyu		%fr8R, %fr5L, %fr24
530	ldd		-0x70(%r30), p032a2
531	xmpyu		%fr8L, %fr5R, %fr25
532	std		s192, 24(rp)
533
534	xmpyu		%fr8R, %fr6L, %fr26
535	ldd		-0x38(%r30), p096b1
536	xmpyu		%fr8L, %fr6R, %fr27
537	fstd		%fr22, -0x78(%r30)	C mid product to  -0x78..-0x71
538
539	xmpyu		%fr8R, %fr7L, %fr28
540	ldd		-0x30(%r30), p096b2
541	xmpyu		%fr8L, %fr7R, %fr29
542	fstd		%fr23, -0x70(%r30)	C mid product to  -0x70..-0x69
543
544	xmpyu		%fr8R, %fr4R, %fr30
545	ldd		-0x58(%r30), p160c1
546	xmpyu		%fr8L, %fr4L, %fr31
547	fstd		%fr24, -0x38(%r30)	C mid product to  -0x38..-0x31
548
549	xmpyu		%fr8R, %fr5R, %fr22
550	ldd		-0x50(%r30), p160c2
551	xmpyu		%fr8L, %fr5L, %fr23
552	fstd		%fr25, -0x30(%r30)	C mid product to  -0x30..-0x29
553
554	xmpyu		%fr8R, %fr6R, %fr24
555	ldd		-0x18(%r30), p224d1
556	xmpyu		%fr8L, %fr6L, %fr25
557	fstd		%fr26, -0x58(%r30)	C mid product to  -0x58..-0x51
558
559	xmpyu		%fr8R, %fr7R, %fr26
560	ldd		-0x10(%r30), p224d2
561	fstd		%fr27, -0x50(%r30)	C mid product to  -0x50..-0x49
562	xmpyu		%fr8L, %fr7L, %fr27
563
C Decrement the group count and repeat while it is nonzero; rp is advanced
C past the four limbs just stored in the delay slot.
564	addib,<>	-1, n, L(loop)
565	ldo		32(rp), rp
566
C Wind-down with two groups left in flight.  This mirrors one loop pass but
C loads and multiplies no new limbs: it completes and stores the older
C group, moves the last group's products from the FP registers to the
C stack, reloads that group's mid products, and falls into end1.
567LDEF(end2)
568	add		p032a1, p032a2, m032
569	ldd		-0x80(%r30), p000a
570	add,dc		p096b1, p096b2, m096
571	fstd		%fr28, -0x18(%r30)	C mid product to  -0x18..-0x11
572	add,dc		p160c1, p160c2, m160
573	ldd		-0x68(%r30), p064a
574	add,dc		p224d1, p224d2, m224
575	fstd		%fr29, -0x10(%r30)	C mid product to  -0x10..-0x09
576	add,dc		%r0, %r0, m288
577	ldd		-0x40(%r30), p064b
578	fstd		%fr30, -0x80(%r30)	C low product to  -0x80..-0x79
579	depd,z		m032, 31, 32, ma000
580	ldd		-0x28(%r30), p128b
581	extrd,u		m032, 31, 32, ma064
582	fstd		%fr31, -0x68(%r30)	C high product to -0x68..-0x61
583	depd		m096, 31, 32, ma064
584	ldd		-0x60(%r30), p128c
585	extrd,u		m096, 31, 32, ma128
586	fstd		%fr22, -0x40(%r30)	C low product to  -0x40..-0x39
587	depd		m160, 31, 32, ma128
588	ldd		-0x48(%r30), p192c
589	extrd,u		m160, 31, 32, ma192
590	fstd		%fr23, -0x28(%r30)	C high product to -0x28..-0x21
591	depd		m224, 31, 32, ma192
592	ldd		-0x20(%r30), p192d
593	extrd,u		m224, 31, 32, ma256
594	fstd		%fr24, -0x60(%r30)	C low product to  -0x60..-0x59
595	depd		m288, 31, 32, ma256
596	ldd		-0x88(%r30), p256d
597	add		climb, p000a, s000
598	fstd		%fr25, -0x48(%r30)	C high product to -0x48..-0x41
599	add,dc		p064a, p064b, s064
600	ldd		0(rp), r000
601	add,dc		p128b, p128c, s128
602	fstd		%fr26, -0x20(%r30)	C low product to  -0x20..-0x19
603	add,dc		p192c, p192d, s192
604	ldd		8(rp), r064
605	add,dc		p256d, %r0, climb
606	fstd		%fr27, -0x88(%r30)	C high product to -0x88..-0x81
607	ldd		16(rp), r128
608	add		ma000, s000, s000	C accum mid 0
609	ldd		24(rp), r192
610	add,dc		ma064, s064, s064	C accum mid 1
611	add,dc		ma128, s128, s128	C accum mid 2
612	add,dc		ma192, s192, s192	C accum mid 3
613	add,dc		ma256, climb, climb
614	sub		r000, s000, s000	C accum rlimb 0
615	sub,db		r064, s064, s064	C accum rlimb 1
616	sub,db		r128, s128, s128	C accum rlimb 2
617	std		s000, 0(rp)
618	sub,db		r192, s192, s192	C accum rlimb 3
619	sub,db		%r0, climb, climb	C climb = -(climb + borrow)
620	sub		%r0, climb, climb	C negate back: climb now includes the borrow
621	std		s064, 8(rp)
C Reload the mid products of the last group for end1.
622	ldd		-0x78(%r30), p032a1
623	std		s128, 16(rp)
624	ldd		-0x70(%r30), p032a2
625	std		s192, 24(rp)
626	ldd		-0x38(%r30), p096b1
627	ldd		-0x30(%r30), p096b2
628	ldd		-0x58(%r30), p160c1
629	ldd		-0x50(%r30), p160c2
630	ldd		-0x18(%r30), p224d1
631	ldd		-0x10(%r30), p224d2
632	ldo		32(rp), rp
633
C Final group of four limbs.  Its mid products are in p032a1..p224d2 and its
C low/high products are in the stack slots, so only integer work remains:
C sum the mid products, realign them, add everything up, subtract from the
C four rp limbs, and store the results.
634LDEF(end1)
635	add		p032a1, p032a2, m032
636	ldd		-0x80(%r30), p000a
637	add,dc		p096b1, p096b2, m096
638	add,dc		p160c1, p160c2, m160
639	ldd		-0x68(%r30), p064a
640	add,dc		p224d1, p224d2, m224
641	add,dc		%r0, %r0, m288	C m288 = carry out of the mid-product chain
642	ldd		-0x40(%r30), p064b
643	depd,z		m032, 31, 32, ma000
644	ldd		-0x28(%r30), p128b
645	extrd,u		m032, 31, 32, ma064
646	depd		m096, 31, 32, ma064
647	ldd		-0x60(%r30), p128c
648	extrd,u		m096, 31, 32, ma128
649	depd		m160, 31, 32, ma128
650	ldd		-0x48(%r30), p192c
651	extrd,u		m160, 31, 32, ma192
652	depd		m224, 31, 32, ma192
653	ldd		-0x20(%r30), p192d
654	extrd,u		m224, 31, 32, ma256
655	depd		m288, 31, 32, ma256
656	ldd		-0x88(%r30), p256d
657	add		climb, p000a, s000
658	add,dc		p064a, p064b, s064
659	ldd		0(rp), r000
660	add,dc		p128b, p128c, s128
661	add,dc		p192c, p192d, s192
662	ldd		8(rp), r064
663	add,dc		p256d, %r0, climb
664	ldd		16(rp), r128
665	add		ma000, s000, s000	C accum mid 0
666	ldd		24(rp), r192
667	add,dc		ma064, s064, s064	C accum mid 1
668	add,dc		ma128, s128, s128	C accum mid 2
669	add,dc		ma192, s192, s192	C accum mid 3
670	add,dc		ma256, climb, climb
671	sub		r000, s000, s000	C accum rlimb 0
672	sub,db		r064, s064, s064	C accum rlimb 1
673	sub,db		r128, s128, s128	C accum rlimb 2
674	std		s000, 0(rp)
675	sub,db		r192, s192, s192	C accum rlimb 3
676	sub,db		%r0, climb, climb	C climb = -(climb + borrow)
677	sub		%r0, climb, climb	C negate back: final carry limb including the borrow
678	std		s064, 8(rp)
679	std		s128, 16(rp)
680	std		s192, 24(rp)
681
C Exit path of the 4-way code: reload r6..r13 from the slots BIG saved them in.
682	ldd		-0xb0(%r30), %r13
683	ldd		-0xb8(%r30), %r12
684	ldd		-0xc0(%r30), %r11
685	ldd		-0xc8(%r30), %r10
686	ldd		-0xd0(%r30), %r9
687	ldd		-0xd8(%r30), %r8
688	ldd		-0xe0(%r30), %r7
689	ldd		-0xe8(%r30), %r6
C Common return.  The short code branches here directly when no four-limb
C group remains; on that path r6..r13 were neither saved nor used.
C The carry limb is returned whole in r28 when HAVE_ABI_2_0w is defined,
C otherwise split into r28 (upper 32 bits) and r29 (lower 32 bits).
690LDEF(done)
691ifdef(`HAVE_ABI_2_0w',
692`	copy		climb, %r28
693',`	extrd,u		climb, 63, 32, %r29
694	extrd,u		climb, 31, 32, %r28
695')
696	ldd		-0xf0(%r30), %r5
697	ldd		-0xf8(%r30), %r4
698	bve		(%r2)	C return through r2
699	ldd,mb		-0x100(%r30), %r3	C delay slot: step r30 back by 0x100 and reload r3
700EPILOGUE(mpn_submul_1)
701