xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/pa32/hppa1_1/submul_1.asm (revision 9ddb6ab554e70fb9bbd90c3d96b812bc57755a14)
1dnl  HP-PA 1.1 mpn_submul_1 -- Multiply a limb vector with a limb and subtract
2dnl  the result from a second limb vector.
3
4dnl  Copyright 1992, 1993, 1994, 2000, 2001, 2002 Free Software Foundation,
5dnl  Inc.
6
7dnl  This file is part of the GNU MP Library.
8
9dnl  The GNU MP Library is free software; you can redistribute it and/or modify
10dnl  it under the terms of the GNU Lesser General Public License as published
11dnl  by the Free Software Foundation; either version 3 of the License, or (at
12dnl  your option) any later version.
13
14dnl  The GNU MP Library is distributed in the hope that it will be useful, but
15dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
16dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
17dnl  License for more details.
18
19dnl  You should have received a copy of the GNU Lesser General Public License
20dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
21
22include(`../config.m4')
23
24C INPUT PARAMETERS
25C res_ptr	r26
26C s1_ptr	r25
27C size		r24
28C s2_limb	r23
29
30C This runs at 12 cycles/limb on a PA7000.  With the instructions used, it
31C cannot be made faster, due to data cache contention after a store.  On the
32C PA7100 it runs at 11 cycles/limb.
33
34C Some of the ideas described in mul_1.asm apply to this code too.
35
36C It seems possible to make this run as fast as mpn_addmul_1, if we use
37C	sub,>>=	%r29,%r19,%r22
38C	addi	1,%r28,%r28
39C but that requires reworking the hairy software pipeline...
40
41ASM_START()
42PROLOGUE(mpn_submul_1)
C mpn_submul_1: for each of the size limbs, multiply the limb read through
C s1_ptr by s2_limb and subtract the product (together with the running
C carry from the previous limb) from the limb at res_ptr, storing the
C difference back in place.
C
C Register use, as visible in the code below:
C   r26  res_ptr, advanced by 4 after each stored limb (multi-limb paths)
C   r25  s1_ptr, advanced by 4 after each loaded limb
C   r24  limbs remaining, decremented by the addib instructions
C   r23  s2_limb, moved through the stack scratch area into fr4
C   r19  word to subtract from the current res limb
C   r28  high product word plus carries; this is the value left in r28 at
C        return (presumably the borrow-out limb handed to the caller --
C        confirm against the calling convention)
C   r1   low word of the product most recently stored in the scratch area
C   r29  limb loaded from res_ptr
C   r22  difference that is stored back
C   -16(r30), -12(r30)  scratch doubleword: high product word at -16, low
C        product word at -12
C
C NOTE: there is no test for size == 0; the first addib only branches when
C size - 1 is zero, and every other case falls into the two-or-more path.
43C	.callinfo	frame=64,no_calls
44
C Advance r30 by 64 to obtain the frame whose top 16 bytes serve as the
C scratch area for moving values between general and floating registers.
45	ldo		64(%r30),%r30
C Load the first s1 limb into fr5 and step s1_ptr to the next limb.
46	fldws,ma	4(%r25),%fr5
47	stw		%r23,-16(%r30)		C move s2_limb ...
C size -= 1; if the result is zero there is exactly one limb to process.
C The indented instruction after a branch sits in its delay slot; the
C just_one_limb path relies on fr4 having been loaded there.
48	addib,=		-1,%r24,L(just_one_limb)
49	 fldws		-16(%r30),%fr4		C ... into fr4
50	add		%r0,%r0,%r0		C clear carry
C Two or more limbs: prime the software pipeline.  The first product goes
C to fr6 and is parked in the scratch doubleword so its two words can be
C read into general registers, while the second limb is loaded into fr7
C and multiplied into fr8.
51	xmpyu		%fr4,%fr5,%fr6
52	fldws,ma	4(%r25),%fr7
53	fstds		%fr6,-16(%r30)
54	xmpyu		%fr4,%fr7,%fr8
55	ldw		-12(%r30),%r19		C least significant limb in product
C Most significant word of the first product.
56	ldw		-16(%r30),%r28
57
C Park the second product in the scratch area.  size -= 1 again; if the
C result is zero there were exactly two limbs and the loop is skipped.
C The delay slot fetches the low word of the second product into r1.
58	fstds		%fr8,-16(%r30)
59	addib,=		-1,%r24,L(end)
60	 ldw		-12(%r30),%r1
61
62C Main loop
63LDEF(loop)
C State at the top of every iteration:
C   r19 = word to subtract from the res limb at 0(r26)
C   r28 = high word of the product whose low word went into r19, plus carry
C   r1  = low word of the following product
C   scratch doubleword = the following product (high word not yet read)
64	ldws		0(%r26),%r29
C Start fetching the s1 limb that is multiplied later in this iteration.
65	fldws,ma	4(%r25),%fr5
C Subtract, then regenerate the borrow as a carry: r22 + r19 is formed
C with r0 as target, so only the carry bit is affected, and that addition
C carries out exactly when the subtraction wrapped around.
66	sub		%r29,%r19,%r22
67	add		%r22,%r19,%r0
C Store the finished limb and step res_ptr.
68	stws,ma		%r22,4(%r26)
C Next word to subtract = previous high word + next low word + borrow.
69	addc		%r28,%r1,%r19
70	xmpyu		%fr4,%fr5,%fr6
C Read the high word of the product in the scratch area before the new
C product overwrites it.
71	ldw		-16(%r30),%r28
72	fstds		%fr6,-16(%r30)
C Fold the carry out of the addc above into that high word.
73	addc		%r0,%r28,%r28
C Count down and loop while limbs remain; the delay slot fetches the low
C word of the product that was just stored.
74	addib,<>	-1,%r24,L(loop)
75	 ldw		-12(%r30),%r1
76
77LDEF(end)
C Pipeline drain: two res limbs remain to be written.  The last product is
C already in the scratch area with its low word in r1, so no further loads
C through s1_ptr and no further multiplies are issued.
C
C Second-to-last limb: same subtract / borrow-to-carry / store sequence as
C in the loop.
78	ldw		0(%r26),%r29
79	sub		%r29,%r19,%r22
80	add		%r22,%r19,%r0
81	stws,ma		%r22,4(%r26)
C Word to subtract from the last limb = previous high word + last low word
C + borrow; then pick up the last high word and add the carry into it.
82	addc		%r28,%r1,%r19
83	ldw		-16(%r30),%r28
84	ldws		0(%r26),%r29
85	addc		%r0,%r28,%r28
C Last limb.
86	sub		%r29,%r19,%r22
87	add		%r22,%r19,%r0
88	stws,ma		%r22,4(%r26)
C r28 = high word of the last product (with carry) + final borrow.
89	addc		%r0,%r28,%r28
C Branch to the address in r2; the delay slot gives back the 64 bytes
C added to r30 on entry.
90	bv		0(%r2)
91	 ldo		-64(%r30),%r30
92
93LDEF(just_one_limb)
C size == 1: no pipelining.  fr5 holds the only s1 limb and fr4 holds
C s2_limb (loaded in the delay slot of the branch that led here).
94	xmpyu		%fr4,%fr5,%fr6
95	ldw		0(%r26),%r29
C Move the product to general registers through the scratch doubleword:
C low word to r1, high word to r28.
96	fstds		%fr6,-16(%r30)
97	ldw		-12(%r30),%r1
98	ldw		-16(%r30),%r28
C Subtract the low word and turn the borrow into a carry, as in the loop.
99	sub		%r29,%r1,%r22
100	add		%r22,%r1,%r0
C Store in place; res_ptr is not advanced on this path.
101	stw		%r22,0(%r26)
C r28 = high product word + borrow.
102	addc		%r0,%r28,%r28
C Branch to the address in r2; the delay slot gives back the 64 bytes
C added to r30 on entry.
103	bv		0(%r2)
104	 ldo		-64(%r30),%r30
105EPILOGUE()
106