xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/sparc32/v9/add_n.asm (revision c38e7cc395b1472a774ff828e46123de44c628e9)
dnl  SPARC mpn_add_n -- Add two limb vectors of the same length > 0 and store
dnl  sum in a third limb vector.

dnl  Copyright 2001 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C INPUT PARAMETERS (SPARC argument registers)
C   rp   destination limb vector
C   s1p  first source limb vector
C   s2p  second source limb vector
C   n    number of limbs (caller guarantees n > 0)
C   cy   running carry, kept in a global register across the loop
define(rp,%o0)
define(s1p,%o1)
define(s2p,%o2)
define(n,%o3)
define(cy,%g1)

C This code uses 64-bit operations on `o' and `g' registers.  It doesn't
C require that `o' registers' upper 32 bits are preserved by the operating
C system, but if they are not, they must be zeroed.  That is indeed what
C happens at least on Slowaris 2.5 and 2.6.

C On UltraSPARC 1 and 2, this code runs at 3 cycles/limb from the Dcache and at
C about 10 cycles/limb from the Ecache.
ASM_START()
PROLOGUE(mpn_add_n)
C mp_limb_t mpn_add_n (mp_ptr rp, mp_srcptr s1p, mp_srcptr s2p, mp_size_t n)
C Adds {s1p,n} and {s2p,n}, stores the sum at {rp,n}, returns carry-out.
C Limbs are 32 bits (lduw zero-extends); pairs are summed with 64-bit `add'
C and the carry recovered from bit 32 via `srlx ...,32'.  The loop is
C software-pipelined two limbs deep (even limb in %o4/%o5, odd in %g2/%g3).
	lduw	[s1p+0],%o4		C prime pipeline: s1[0]
	lduw	[s2p+0],%o5		C s2[0]
	addcc	n,-2,n			C n -= 2, set condition codes
	bl,pn	%icc,L(end1)		C n was 1: single-limb tail
	lduw	[s1p+4],%g2		C (delay slot) s1[1]
	lduw	[s2p+4],%g3		C s2[1]
	be,pn	%icc,L(end2)		C n was 2: two-limb tail, no loop
	mov	0,cy			C (delay slot) clear incoming carry

	.align	16
L(loop):
	add	%o4,%o5,%g4		C even limb: s1[i] + s2[i]
	add	rp,8,rp
	lduw	[s1p+8],%o4		C fetch next even limb
	fitod	%f0,%f2			C dummy FP op, presumably a scheduling
					C filler for insn grouping -- TODO confirm
C ---
	add	cy,%g4,%g4		C add carry-in
	addcc	n,-1,n
	lduw	[s2p+8],%o5
	fitod	%f0,%f2			C dummy FP op (scheduling filler)
C ---
	srlx	%g4,32,cy		C carry-out lives in bit 32 of 64-bit sum
	add	s2p,8,s2p
	stw	%g4,[rp-8]		C store low 32 bits of even sum
	be,pn	%icc,L(exito)+4		C odd exit; +4 skips the first insn of
					C L(exito), already done in the delay slot
C ---
	add	%g2,%g3,%g4		C (delay slot) odd limb: s1[i+1] + s2[i+1]
	addcc	n,-1,n
	lduw	[s1p+12],%g2		C fetch next odd limb
	fitod	%f0,%f2			C dummy FP op (scheduling filler)
C ---
	add	cy,%g4,%g4		C add carry-in
	add	s1p,8,s1p
	lduw	[s2p+4],%g3
	fitod	%f0,%f2			C dummy FP op (scheduling filler)
C ---
	srlx	%g4,32,cy
	bne,pt	%icc,L(loop)
	stw	%g4,[rp-4]		C (delay slot) store low 32 bits of odd sum
C ---
L(exite):
C Even exit (fall-through when n hits 0 on the second decrement): the two
C prefetched limb pairs (%o4/%o5 and %g2/%g3) remain to be summed.
	add	%o4,%o5,%g4
	add	cy,%g4,%g4
	srlx	%g4,32,cy
	stw	%g4,[rp+0]
	add	%g2,%g3,%g4
	add	cy,%g4,%g4
	stw	%g4,[rp+4]
	retl
	srlx	%g4,32,%o0		C (delay slot) return carry-out

L(exito):
C Odd exit: entered at L(exito)+4 with the %g2+%g3 add already executed in
C the branch delay slot above.
	add	%g2,%g3,%g4
	add	cy,%g4,%g4
	srlx	%g4,32,cy
	stw	%g4,[rp-4]
	add	%o4,%o5,%g4
	add	cy,%g4,%g4
	stw	%g4,[rp+0]
	retl
	srlx	%g4,32,%o0		C (delay slot) return carry-out

L(end1):
C n == 1: one limb pair, no carry-in possible.
	add	%o4,%o5,%g4
	stw	%g4,[rp+0]
	retl
	srlx	%g4,32,%o0		C (delay slot) return carry-out

L(end2):
C n == 2: two limb pairs, no loop.
	add	%o4,%o5,%g4
	srlx	%g4,32,cy
	stw	%g4,[rp+0]
	add	%g2,%g3,%g4
	add	cy,%g4,%g4
	stw	%g4,[rp+4]
	retl
	srlx	%g4,32,%o0		C (delay slot) return carry-out
EPILOGUE(mpn_add_n)
130