xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/powerpc64/mode64/aors_n.asm (revision d25ffa98a4bfca1fe272f3c182496ec9934faac7)
1dnl  PowerPC-64 mpn_add_n/mpn_sub_n -- mpn addition and subtraction.
2
3dnl  Copyright 1999, 2000, 2001, 2003, 2004, 2005, 2007 Free Software
4dnl  Foundation, Inc.
5
6dnl  This file is part of the GNU MP Library.
7
8dnl  The GNU MP Library is free software; you can redistribute it and/or modify
9dnl  it under the terms of the GNU Lesser General Public License as published
10dnl  by the Free Software Foundation; either version 3 of the License, or (at
11dnl  your option) any later version.
12
13dnl  The GNU MP Library is distributed in the hope that it will be useful, but
14dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
16dnl  License for more details.
17
18dnl  You should have received a copy of the GNU Lesser General Public License
19dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
20
21include(`../config.m4')
22
23C		cycles/limb
24C POWER3/PPC630:     1.5
25C POWER4/PPC970:     2
26
27C   n	   POWER3/PPC630   POWER4/PPC970
28C     1	       17.00	       19.00
29C     2		9.00	       10.49
30C     3		5.33		7.66
31C     4		4.50		5.14
32C     5		4.20		4.80
33C     6		3.83		4.33
34C     7		3.00		3.99
35C     8		2.87		3.55
36C     9		2.89		3.40
37C    10		2.60		3.42
38C    11		2.45		3.15
39C    12		2.41		2.99
40C    13		2.46		3.01
41C    14		2.42		2.97
42C    15		2.20		2.85
43C    50		1.78		2.44
44C   100		1.83		2.20
45C   200		1.55		2.12
46C   400		1.53		2.05
47C  1000		1.98		2.02#
48C  2000		1.50#		2.04
49C  4000		2.55		2.50
50C  8000		2.70		2.45
51C 16000		2.65		5.94
52C 32000		2.62	       16.41
53C 64000		2.73	       18.94
54
55C This code is a little bit slower for POWER3/PPC630 than the simple code used
56C previously, but it is much faster for POWER4/PPC970.  The reason for the
57C POWER3/PPC630 slowdown can be attributed to the saving and restoring of 4
58C registers.
59
60C INPUT PARAMETERS
61C rp	r3
62C up	r4
63C vp	r5
64C n	r6
65
C Addition variant.  When OPERATION_add_n is defined the shared code further
C down assembles as mpn_add_n and mpn_add_nc.
C   ADDSUBC  add-with-carry instruction applied to every limb pair
C   ADDSUB   carry-producing add without carry-in; defined here but not used
C            in the code visible in this file
C   GENRVAL  adds 1 to the -1 or 0 left in r3 by the final subfe, giving the
C            carry-out as 0 or 1
C   SETCBR   adds -1 to its register argument, which sets CA exactly when
C            that register is nonzero
C   CLRCB    adds 0 to r0, which can never carry, so CA ends up clear
66ifdef(`OPERATION_add_n',`
67  define(ADDSUBC,	adde)
68  define(ADDSUB,	addc)
69  define(func,		mpn_add_n)
70  define(func_nc,	mpn_add_nc)
71  define(GENRVAL,	`addi	r3, r3, 1')
72  define(SETCBR,	`addic	r0, $1, -1')
73  define(CLRCB,		`addic	r0, r0, 0')
74')
C Subtraction variant.  When OPERATION_sub_n is defined the shared code
C further down assembles as mpn_sub_n and mpn_sub_nc.  With subfe the CA bit
C holds the inverse of a borrow: CA = 1 means no borrow.
C   ADDSUBC  subtract-from-extended instruction applied to every limb pair
C   ADDSUB   borrow-producing subtract without borrow-in; defined here but not
C            used in the code visible in this file
C   GENRVAL  negates the -1 or 0 left in r3 by the final subfe, giving the
C            borrow-out as 1 or 0
C   SETCBR   computes 0 minus its register argument, which sets CA exactly
C            when that register is zero, that is, when there is no borrow-in
C   CLRCB    adds -1 to r1, which sets CA whenever r1 is nonzero; r1 is
C            presumably the stack pointer and hence never zero -- confirm
75ifdef(`OPERATION_sub_n',`
76  define(ADDSUBC,	subfe)
77  define(ADDSUB,	subfc)
78  define(func,		mpn_sub_n)
79  define(func_nc,	mpn_sub_nc)
80  define(GENRVAL,	`neg	r3, r3')
81  define(SETCBR,	`subfic	r0, $1, 0')
82  define(CLRCB,		`addic	r0, r1, -1')
83')
84
C The four entry-point names this source can be built as; they match the
C func and func_nc definitions in the two ifdef blocks.  MULFUNC_PROLOGUE and
C ASM_START are defined outside this file -- presumably via the included
C config.m4; verify there.
85MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
86
87ASM_START()
C func_nc: entry point that starts from a caller-supplied carry (add) or
C borrow (sub) instead of zero.  SETCBR seeds the CA bit from r7 -- presumably
C the carry-in argument; it is not listed under INPUT PARAMETERS, confirm --
C and control then joins the common code at L(ent), which lies inside func
C just after its CLRCB.  r0 is overwritten as a side effect of SETCBR.
88PROLOGUE(func_nc)
89	SETCBR(r7)
90	b	L(ent)
91EPILOGUE()
92
C func: for i in 0..n-1 store up[i] + vp[i] (add build) or up[i] - vp[i] (sub
C build) to rp[i], propagating carry/borrow between limbs, and return the
C final carry or borrow as 0 or 1 in r3.  func_nc enters at L(ent) with CA
C already seeded.
C
C Structure: the n mod 4 leading limbs are handled by one of the straight-line
C blocks L(b01), L(b10), L(b11); everything else is processed four limbs at a
C time by L(go), L(oop) and L(end).  ctr is loaded with (n+3)/4, the number
C of groups counting the partial one.
C
C The carry lives in CA from CLRCB/SETCBR until the final subfe.  The only
C instructions in between that write CA are the ADDSUBC ones, so their
C relative order must be kept.
C
C Operand order: every ADDSUBC is written with the vp limb first and the up
C limb second; for subfe that yields up minus vp.
C
C NOTE(review): n = 0 does not appear to be handled -- ctr would be loaded
C with 0 and the bdz at L(go) would decrement it past zero.  Callers are
C presumably required to pass n >= 1; confirm.
93PROLOGUE(func)
94	CLRCB			C no carry-in (add) / no borrow-in (sub)
C Save r28-r31, used as result registers below.  They are stored beneath the
C stack pointer without moving r1 and reloaded at L(ret).
95L(ent):	std	r31, -8(r1)
96	std	r30, -16(r1)
97	std	r29, -24(r1)
98	std	r28, -32(r1)
99
C Dispatch on n mod 4: 0 goes to L(b00), 1 to L(b01), 2 to L(b10), and 3
C falls through into L(b11).
100	rldicl.	r0, r6, 0,62	C r0 = n & 3, set cr0
101	cmpdi	cr6, r0, 2
102	addi	r6, r6, 3	C compute count...
103	srdi	r6, r6, 2	C ...for ctr
104	mtctr	r6		C copy count into ctr
105	beq	cr0, L(b00)
106	blt	cr6, L(b01)
107	beq	cr6, L(b10)
108
C n mod 4 = 3: process three limbs, advance all three pointers by 24 bytes.
C The bdnz counts off this partial group; if ctr reaches zero there were no
C further limbs and we return.
109L(b11):	ld	r8, 0(r4)	C load s1 limb
110	ld	r9, 0(r5)	C load s2 limb
111	ld	r10, 8(r4)	C load s1 limb
112	ld	r11, 8(r5)	C load s2 limb
113	ld	r12, 16(r4)	C load s1 limb
114	addi	r4, r4, 24
115	ld	r0, 16(r5)	C load s2 limb
116	addi	r5, r5, 24
117	ADDSUBC	r29, r9, r8
118	ADDSUBC	r30, r11, r10
119	ADDSUBC	r31, r0, r12
120	std	r29, 0(r3)
121	std	r30, 8(r3)
122	std	r31, 16(r3)
123	addi	r3, r3, 24
124	bdnz	L(go)
125	b	L(ret)
126
C n mod 4 = 1: process one limb, advance the pointers by 8 bytes, then
C continue with full groups or return as above.
127L(b01):	ld	r12, 0(r4)	C load s1 limb
128	addi	r4, r4, 8
129	ld	r0, 0(r5)	C load s2 limb
130	addi	r5, r5, 8
131	ADDSUBC	r31, r0, r12	C add
132	std	r31, 0(r3)
133	addi	r3, r3, 8
134	bdnz	L(go)
135	b	L(ret)
136
C n mod 4 = 2: process two limbs, advance the pointers by 16 bytes, then
C continue with full groups or return as above.
137L(b10):	ld	r10, 0(r4)	C load s1 limb
138	ld	r11, 0(r5)	C load s2 limb
139	ld	r12, 8(r4)	C load s1 limb
140	addi	r4, r4, 16
141	ld	r0, 8(r5)	C load s2 limb
142	addi	r5, r5, 16
143	ADDSUBC	r30, r11, r10	C add
144	ADDSUBC	r31, r0, r12	C add
145	std	r30, 0(r3)
146	std	r31, 8(r3)
147	addi	r3, r3, 16
148	bdnz	L(go)
149	b	L(ret)
150
C n mod 4 = 0 enters here directly; nothing needs doing first because CA was
C already set up by CLRCB or SETCBR.
151L(b00):	C INITCY		C clear/set cy
C Preload one full group of four limb pairs into r6-r12 and r0.  If this is
C the last group, bdz skips the loop and goes straight to L(end) without
C advancing up and vp.
152L(go):	ld	r6, 0(r4)	C load s1 limb
153	ld	r7, 0(r5)	C load s2 limb
154	ld	r8, 8(r4)	C load s1 limb
155	ld	r9, 8(r5)	C load s2 limb
156	ld	r10, 16(r4)	C load s1 limb
157	ld	r11, 16(r5)	C load s2 limb
158	ld	r12, 24(r4)	C load s1 limb
159	ld	r0, 24(r5)	C load s2 limb
160	bdz	L(end)
161
162	addi	r4, r4, 32
163	addi	r5, r5, 32
164
C Main loop, four limbs per pass.  Each ADDSUBC consumes a limb pair loaded
C during the previous pass (or by L(go)), and those registers are reloaded
C right away with the next group.  Results collect in r28-r31 and are stored
C after all four have been computed.  On exit the operands of the final group
C are already in registers.
165L(oop):	ADDSUBC	r28, r7, r6
166	ld	r6, 0(r4)	C load s1 limb
167	ld	r7, 0(r5)	C load s2 limb
168	ADDSUBC	r29, r9, r8
169	ld	r8, 8(r4)	C load s1 limb
170	ld	r9, 8(r5)	C load s2 limb
171	ADDSUBC	r30, r11, r10
172	ld	r10, 16(r4)	C load s1 limb
173	ld	r11, 16(r5)	C load s2 limb
174	ADDSUBC	r31, r0, r12
175	ld	r12, 24(r4)	C load s1 limb
176	ld	r0, 24(r5)	C load s2 limb
177	std	r28, 0(r3)
178	addi	r4, r4, 32
179	std	r29, 8(r3)
180	addi	r5, r5, 32
181	std	r30, 16(r3)
182	std	r31, 24(r3)
183	addi	r3, r3, 32
184	bdnz	L(oop)		C decrement ctr and loop back
185
C Final group: operands are already loaded, so only compute and store.  rp is
C not advanced because nothing is written after this.
186L(end):	ADDSUBC	r28, r7, r6
187	ADDSUBC	r29, r9, r8
188	ADDSUBC	r30, r11, r10
189	ADDSUBC	r31, r0, r12
190	std	r28, 0(r3)
191	std	r29, 8(r3)
192	std	r30, 16(r3)
193	std	r31, 24(r3)
194
C Restore the saved registers; these loads leave CA untouched.
195L(ret):	ld	r31, -8(r1)
196	ld	r30, -16(r1)
197	ld	r29, -24(r1)
198	ld	r28, -32(r1)
199
C subfe with identical source operands gives CA - 1 whatever r0 holds: 0 when
C CA is set, -1 when it is clear.  GENRVAL then maps that to the 0 or 1
C carry (add) or borrow (sub) returned in r3.
200	subfe	r3, r0, r0	C -cy
201	GENRVAL
202	blr
203EPILOGUE()
204