xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/pa32/hppa1_1/mul_1.asm (revision 4391d5e9d4f291db41e3b3ba26a01b5e51364aae)
1dnl  HP-PA 1.1 mpn_mul_1 -- Multiply a limb vector with a limb and store the
2dnl  result in a second limb vector.
3
4dnl  Copyright 1992, 1993, 1994, 2000, 2001, 2002 Free Software Foundation,
5dnl  Inc.
6
7dnl  This file is part of the GNU MP Library.
8
9dnl  The GNU MP Library is free software; you can redistribute it and/or modify
10dnl  it under the terms of the GNU Lesser General Public License as published
11dnl  by the Free Software Foundation; either version 3 of the License, or (at
12dnl  your option) any later version.
13
14dnl  The GNU MP Library is distributed in the hope that it will be useful, but
15dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
16dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
17dnl  License for more details.
18
19dnl  You should have received a copy of the GNU Lesser General Public License
20dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
21
22include(`../config.m4')
23
24C INPUT PARAMETERS
25C res_ptr	r26
26C s1_ptr	r25
27C size		r24
28C s2_limb	r23
29
30C This runs at 9 cycles/limb on a PA7000.  With the instructions used, it
31C cannot be made faster due to data cache contention after a store.  On the
32C PA7100 it runs at 7 cycles/limb.
33
34C We could use fldds to read two limbs at a time from the S1 array, and that
35C could bring down the times to 8.5 and 6.5 cycles/limb for the PA7000 and
36C PA7100, respectively.  We don't do that since it does not seem worth the
37C (alignment) troubles...
38
39C At least the PA7100 is rumored to be able to deal with cache-misses without
40C stalling instruction issue.  If this is true, and the cache is actually also
41C lockup-free, we should use a deeper software pipeline, and load from S1 very
42C early!  (The loads and stores to -12(sp) will surely be in the cache.)
43
44ASM_START()
C mpn_mul_1 -- multiply the size-limb vector at s1_ptr by s2_limb and store
C the low size limbs of the result at res_ptr.  The most significant limb of
C the result is left in r28 (presumably the return-value register -- confirm
C against the calling convention).
C
C Register roles inside the routine:
C   r26  res_ptr, stepped by 4 on every store
C   r25  s1_ptr, stepped by 4 on every load
C   r24  number of limbs still to be fetched, decremented by each addib
C   fr4  s2_limb, kept for the whole routine
C   r19  result limb that is ready to be stored
C   r28  high word of the older product
C   r1   low word of the newer product
C
C The 8 bytes at -16(sp) are used as a scratch slot to move values between
C the general and the floating-point registers.  After an fstds of a 64-bit
C product, -16(sp) holds its high word and -12(sp) its low word.
C
C An instruction indented by one extra space sits in the delay slot of the
C branch above it.
C
C The carry bit cleared before the first multiply is consumed and produced by
C each addc; the code relies on the instructions in between, including addib,
C leaving it undisturbed.
C
C size is decremented before it is first tested, so size is assumed to be at
C least 1.
45PROLOGUE(mpn_mul_1)
46C	.callinfo	frame=64,no_calls
47
48	ldo		64(%r30),%r30		C advance sp by 64 (frame=64 above)
49	fldws,ma	4(%r25),%fr5		C fr5 = first s1 limb, step s1_ptr
50	stw		%r23,-16(%r30)		C move s2_limb ...
51	addib,=		-1,%r24,L(just_one_limb)	C size-1 == 0: single-limb path
52	 fldws		-16(%r30),%fr4		C ... into fr4
53	add		%r0,%r0,%r0		C clear carry
54	xmpyu		%fr4,%fr5,%fr6		C fr6 = first product (64 bits)
55	fldws,ma	4(%r25),%fr7		C fr7 = second s1 limb, step s1_ptr
56	fstds		%fr6,-16(%r30)		C spill first product to scratch
57	xmpyu		%fr4,%fr7,%fr8		C fr8 = second product
58	ldw		-12(%r30),%r19		C least significant limb in product
59	ldw		-16(%r30),%r28		C high word of first product
60
61	fstds		%fr8,-16(%r30)		C spill second product to scratch
62	addib,=		-1,%r24,L(end)		C no limbs left to fetch: drain
63	 ldw		-12(%r30),%r1		C low word of second product
64
65C Main loop
C On entry to each iteration r19 is a finished result limb, r28 is the high
C word of the older product and r1 is the low word of the newer product, whose
C high word is still in the scratch slot at -16(sp).
66LDEF(loop)
67	fldws,ma	4(%r25),%fr5		C fetch next s1 limb, step s1_ptr
68	stws,ma		%r19,4(%r26)		C store finished limb, step res_ptr
69	addc		%r28,%r1,%r19		C next limb = high(older) + low(newer) + carry
70	xmpyu		%fr4,%fr5,%fr6		C multiply the limb just fetched
71	ldw		-16(%r30),%r28		C high word of newer product; must be read before the fstds below overwrites it
72	fstds		%fr6,-16(%r30)		C spill the new product to scratch
73	addib,<>	-1,%r24,L(loop)		C loop while limbs remain
74	 ldw		-12(%r30),%r1		C low word of the new product
75
C Drain the pipeline: two result limbs are still to be stored and the high
C word of the last product is still in the scratch slot.
76LDEF(end)
77	stws,ma		%r19,4(%r26)		C store finished limb, step res_ptr
78	addc		%r28,%r1,%r19		C last stored limb = high(older) + low(last) + carry
79	ldw		-16(%r30),%r28		C high word of the last product
80	stws,ma		%r19,4(%r26)		C store the last result limb
81	addc		%r0,%r28,%r28		C fold the final carry into the high limb
82	bv		0(%r2)			C return through r2
83	 ldo		-64(%r30),%r30		C release the frame in the delay slot
84
C size == 1: a single multiply, no carry propagation needed.
85LDEF(just_one_limb)
86	xmpyu		%fr4,%fr5,%fr6		C fr6 = s2_limb * the only s1 limb
87	fstds		%fr6,-16(%r30)		C spill product to scratch
88	ldw		-16(%r30),%r28		C r28 = high word of the product
89	ldo		-64(%r30),%r30		C release the frame
90	bv		0(%r2)			C return through r2
91	 fstws		%fr6R,0(%r26)		C store the low word (right half of fr6) at res_ptr
92EPILOGUE()
93