xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/pa32/hppa1_1/addmul_1.asm (revision 9aa0541bdf64142d9a27c2cf274394d60182818f)
1dnl  HP-PA 1.1 mpn_addmul_1 -- Multiply a limb vector with a limb and add the
2dnl  result to a second limb vector.
3
4dnl  Copyright 1992, 1993, 1994, 2000, 2001, 2002 Free Software Foundation,
5dnl  Inc.
6
7dnl  This file is part of the GNU MP Library.
8
9dnl  The GNU MP Library is free software; you can redistribute it and/or modify
10dnl  it under the terms of the GNU Lesser General Public License as published
11dnl  by the Free Software Foundation; either version 3 of the License, or (at
12dnl  your option) any later version.
13
14dnl  The GNU MP Library is distributed in the hope that it will be useful, but
15dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
16dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
17dnl  License for more details.
18
19dnl  You should have received a copy of the GNU Lesser General Public License
20dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
21
22include(`../config.m4')
23
24C INPUT PARAMETERS
25C res_ptr	r26
26C s1_ptr	r25
27C size		r24
28C s2_limb	r23
29
30C This runs at 11 cycles/limb on a PA7000.  With the instructions used, it
31C cannot become faster due to data cache contention after a store.  On the
32C PA7100 it runs at 10 cycles/limb.
33
34C There are some ideas described in mul_1.asm that apply to this code too.
35
36ASM_START()
37PROLOGUE(mpn_addmul_1)
C For i in 0..size-1: res_ptr[i] += s1_ptr[i] * s2_limb, propagating the
C carry from limb to limb.  The carry-out limb is left in r28 at return.
C
C Register roles in this routine:
C   r24        remaining-limb counter, decremented by each addib
C   r19        sum limb about to be added to the next res_ptr word
C   r28        high word of an earlier product, plus absorbed carry
C   r1         low word of the most recently stored product
C   r29        word loaded from res_ptr
C   fr4        s2_limb; fr5/fr7 hold s1 limbs; fr6/fr8 hold 64-bit products
C   -16(r30)   8-byte scratch: high product word at -16, low word at -12
C
C The instruction after each branch is indented by one space; it sits in the
C branch delay slot and executes whether or not the branch is taken.
C
C The code is software pipelined: the product for limb i+1 is computed and
C spilled while the sum for limb i is being added into res_ptr.  Each ldw of
C the high word at -16 must stay ahead of the fstds that overwrites the slot.
C
C NOTE(review): there is no size == 0 check; the first addib only tests for
C size == 1, so callers presumably guarantee size >= 1 -- confirm.
38C	.callinfo	frame=64,no_calls
39
40	ldo		64(%r30),%r30		C r30 += 64: reserve scratch frame
41	fldws,ma	4(%r25),%fr5		C fr5 = s1[0]; s1_ptr += 4
42	stw		%r23,-16(%r30)		C move s2_limb ...
43	addib,=		-1,%r24,L(just_one_limb)	C size -= 1; branch if now 0
44	 fldws		-16(%r30),%fr4		C ... into fr4
45	add		%r0,%r0,%r0		C clear carry
46	xmpyu		%fr4,%fr5,%fr6		C fr6 = s2_limb * s1[0], 64 bits
47	fldws,ma	4(%r25),%fr7		C fr7 = s1[1]; s1_ptr += 4
48	fstds		%fr6,-16(%r30)		C spill first product to scratch
49	xmpyu		%fr4,%fr7,%fr8		C fr8 = s2_limb * s1[1], 64 bits
50	ldw		-12(%r30),%r19		C least significant limb in product
51	ldw		-16(%r30),%r28		C most significant limb in product
52
53	fstds		%fr8,-16(%r30)		C spill second product to scratch
54	addib,=		-1,%r24,L(end)		C size -= 1; skip loop if now 0
55	 ldw		-12(%r30),%r1		C r1 = low word of second product
56
57C Main loop
C On entry to each iteration: r19 = pending sum limb, r28 = pending high
C word, r1 = low word of the product currently held in the scratch slot.
58LDEF(loop)
59	ldws		0(%r26),%r29		C r29 = current res_ptr word
60	fldws,ma	4(%r25),%fr5		C fr5 = next s1 limb; s1_ptr += 4
61	add		%r29,%r19,%r19		C r19 += res word; sets carry
62	stws,ma		%r19,4(%r26)		C store result limb; res_ptr += 4
63	addc		%r28,%r1,%r19		C r19 = r28 + r1 + carry
64	xmpyu		%fr4,%fr5,%fr6		C fr6 = s2_limb * next s1 limb
65	ldw		-16(%r30),%r28		C high word of product still in scratch
66	fstds		%fr6,-16(%r30)		C now overwrite scratch with new product
67	addc		%r0,%r28,%r28		C r28 += carry from the addc above
68	addib,<>	-1,%r24,L(loop)		C size -= 1; loop while nonzero
69	 ldw		-12(%r30),%r1		C r1 = low word of new product
70
C Drain the pipeline: two result limbs are still outstanding here, one in
C r19/r28 and one in the scratch slot (low word already in r1).
71LDEF(end)
72	ldw		0(%r26),%r29		C r29 = current res_ptr word
73	add		%r29,%r19,%r19		C r19 += res word; sets carry
74	stws,ma		%r19,4(%r26)		C store result limb; res_ptr += 4
75	addc		%r28,%r1,%r19		C r19 = r28 + r1 + carry
76	ldw		-16(%r30),%r28		C r28 = high word of last product
77	ldws		0(%r26),%r29		C r29 = last res_ptr word
78	addc		%r0,%r28,%r28		C r28 += carry from the addc above
79	add		%r29,%r19,%r19		C r19 += res word; sets carry
80	stws,ma		%r19,4(%r26)		C store final result limb
81	addc		%r0,%r28,%r28		C r28 = carry-out limb
82	bv		0(%r2)			C branch to address in r2
83	 ldo		-64(%r30),%r30		C r30 -= 64: release scratch frame
84
C size == 1 path, entered with fr4 = s2_limb and fr5 = s1[0].
85LDEF(just_one_limb)
86	xmpyu		%fr4,%fr5,%fr6		C fr6 = s2_limb * s1[0], 64 bits
87	ldw		0(%r26),%r29		C r29 = res[0]
88	fstds		%fr6,-16(%r30)		C spill product to scratch
89	ldw		-12(%r30),%r1		C r1 = low word of product
90	ldw		-16(%r30),%r28		C r28 = high word of product
91	add		%r29,%r1,%r19		C r19 = res[0] + low word; sets carry
92	stw		%r19,0(%r26)		C res[0] = r19
93	addc		%r0,%r28,%r28		C r28 = high word + carry
94	bv		0(%r2)			C branch to address in r2
95	 ldo		-64(%r30),%r30		C r30 -= 64: release scratch frame
96EPILOGUE()
97