xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/powerpc64/mode64/aors_n.asm (revision 7c192b2a5e1093666e67801684f930ef49b3b363)
1dnl  PowerPC-64 mpn_add_n/mpn_sub_n -- mpn addition and subtraction.
2
3dnl  Copyright 1999, 2000, 2001, 2003, 2004, 2005, 2007, 2011 Free Software
4dnl  Foundation, Inc.
5
6dnl  This file is part of the GNU MP Library.
7
8dnl  The GNU MP Library is free software; you can redistribute it and/or modify
9dnl  it under the terms of the GNU Lesser General Public License as published
10dnl  by the Free Software Foundation; either version 3 of the License, or (at
11dnl  your option) any later version.
12
13dnl  The GNU MP Library is distributed in the hope that it will be useful, but
14dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
16dnl  License for more details.
17
18dnl  You should have received a copy of the GNU Lesser General Public License
19dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
20
21include(`../config.m4')
22
23C                   cycles/limb
24C POWER3/PPC630          1.5
25C POWER4/PPC970          2
26C POWER5                 2
27C POWER6                 2.63
28C POWER7               2.25-2.87
29
30C This code is a little bit slower for POWER3/PPC630 than the simple code used
31C previously, but it is much faster for POWER4/PPC970.  The reason for the
32C POWER3/PPC630 slowdown can be attributed to the saving and restoring of 4
33C registers.
34
35C INPUT PARAMETERS
36C rp	r3
37C up	r4
38C vp	r5
39C n	r6
C cy	r7	(carry-in; mpn_add_nc and mpn_sub_nc entry points only)
40
C Operation-specific definitions.  The build presumably defines exactly one
C of OPERATION_add_n and OPERATION_sub_n (TODO confirm against the build
C rules); that choice selects the instructions and symbol names used below.
C
C   ADDSUBC  per-limb instruction that consumes and produces the CA bit
C            (adde for add; subfe for sub, which computes rB - rA - borrow)
C   ADDSUB   variant without carry-in; defined here but not referenced in
C            the code below
C   func     name of the entry point without a carry-in argument
C   func_nc  name of the entry point that takes a carry-in argument
C   GENRVAL  converts the subfe result at L(ret), which is CA - 1, into the
C            value returned in r3: CA for add, 1 - CA (the borrow) for sub
C   SETCBR   sets CA from the register given as argument, clobbering r0.
C            add: CA = 1 iff the register is non-zero.
C            sub: CA = 1 iff the register is zero, since subfe treats
C            CA = 1 as no borrow.
C   CLRCB    puts CA in the no-carry / no-borrow state, clobbering r0.
C            add: CA = 0.  sub: CA = 1, which relies on r1 being non-zero
C            (r1 is used as the base for the register saves further down,
C            presumably the stack pointer).
C
C NOTE: keep quote characters out of any comment placed inside the quoted
C ifdef bodies below; they would terminate the m4 quoting early.
41ifdef(`OPERATION_add_n',`
42  define(ADDSUBC,	adde)
43  define(ADDSUB,	addc)
44  define(func,		mpn_add_n)
45  define(func_nc,	mpn_add_nc)
46  define(GENRVAL,	`addi	r3, r3, 1')
47  define(SETCBR,	`addic	r0, $1, -1')
48  define(CLRCB,		`addic	r0, r0, 0')
49')
50ifdef(`OPERATION_sub_n',`
51  define(ADDSUBC,	subfe)
52  define(ADDSUB,	subfc)
53  define(func,		mpn_sub_n)
54  define(func_nc,	mpn_sub_nc)
55  define(GENRVAL,	`neg	r3, r3')
56  define(SETCBR,	`subfic	r0, $1, 0')
57  define(CLRCB,		`addic	r0, r1, -1')
58')
59
C The next line names the four entry points this source can produce; it is
C presumably consumed by the build machinery rather than the assembler
C (TODO confirm).  A given build emits one func/func_nc pair.
60MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
61
62ASM_START()
C func_nc: add/subtract with an incoming carry/borrow.
C   In:  r3 = rp, r4 = up, r5 = vp, r6 = n, r7 = carry-in
C        (the carry/borrow is treated as set iff r7 is non-zero)
C   Clobbers r0 here, then shares everything from L(ent) onwards with func.
63PROLOGUE(func_nc)
64	SETCBR(r7)		C CA = incoming carry (add) or NOT borrow (sub)
65	b	L(ent)		C join func after its CLRCB so CA is kept
66EPILOGUE()
67
C func: rp[i] = up[i] +/- vp[i] for i = 0 .. n-1, propagating carry/borrow
C through the CA bit, and return the final carry/borrow (0 or 1) in r3.
C
C Register roles
C   r3        rp, advanced as results are stored
C   r4, r5    up (s1) and vp (s2), advanced as limbs are consumed
C   r6        n on entry; then ceil(n/4) for ctr; then reused as a limb
C   ctr       number of groups still to process (one group = up to 4 limbs)
C   r0, r6-r12  loaded source limbs (r0 also holds n mod 4 during dispatch)
C   r28-r31   result limbs; saved below r1 on entry and restored at L(ret)
C   CA        the running carry.  Every instruction between two ADDSUBC
C             must leave CA untouched (ld, std, addi, bdnz/bdz and mtctr
C             do; addic and subfic would not).
C
C ADDSUBC rD, s2, s1 yields s1 + s2 + CA for add and s1 - s2 - borrow for
C sub (subfe subtracts its first source from its second).
C
C Structure: the first n mod 4 limbs are peeled off by L(b01), L(b10) or
C L(b11); the remaining multiple of 4 limbs go through L(go), L(top) and
C L(end), which load one group ahead of the arithmetic.
C
C NOTE(review): n = 0 is not handled.  ctr would be loaded with 0 and the
C bdz at L(go) would decrement it past zero instead of exiting.  Callers
C presumably guarantee n >= 1; confirm against the mpn interface contract.
68PROLOGUE(func)
69	CLRCB			C start with no carry (add) / no borrow (sub)
C r28-r31 are stored below r1 without moving r1.  This assumes the ABI lets
C a leaf routine use that area (TODO confirm for every target ABI).
70L(ent):	std	r31, -8(r1)
71	std	r30, -16(r1)
72	std	r29, -24(r1)
73	std	r28, -32(r1)
74
75	rldicl.	r0, r6, 0,62	C r0 = n & 3, set cr0
76	cmpdi	cr6, r0, 2	C cr6 = (n & 3) compared with 2
77	addi	r6, r6, 3	C compute count...
78	srdi	r6, r6, 2	C ...for ctr
79	mtctr	r6		C copy count into ctr
80	beq	cr0, L(b00)	C n mod 4 = 0
81	blt	cr6, L(b01)	C n mod 4 = 1
82	beq	cr6, L(b10)	C n mod 4 = 2
83
C n mod 4 = 3: handle three limbs, then continue with full groups if any.
84L(b11):	ld	r8, 0(r4)	C load s1 limb
85	ld	r9, 0(r5)	C load s2 limb
86	ld	r10, 8(r4)	C load s1 limb
87	ld	r11, 8(r5)	C load s2 limb
88	ld	r12, 16(r4)	C load s1 limb
89	addi	r4, r4, 24
90	ld	r0, 16(r5)	C load s2 limb
91	addi	r5, r5, 24
92	ADDSUBC	r29, r9, r8
93	ADDSUBC	r30, r11, r10
94	ADDSUBC	r31, r0, r12
95	std	r29, 0(r3)
96	std	r30, 8(r3)
97	std	r31, 16(r3)
98	addi	r3, r3, 24
99	bdnz	L(go)		C more groups remain
100	b	L(ret)		C n was 3
101
C n mod 4 = 1: handle one limb, then continue with full groups if any.
102L(b01):	ld	r12, 0(r4)	C load s1 limb
103	addi	r4, r4, 8
104	ld	r0, 0(r5)	C load s2 limb
105	addi	r5, r5, 8
106	ADDSUBC	r31, r0, r12	C add/sub with carry
107	std	r31, 0(r3)
108	addi	r3, r3, 8
109	bdnz	L(go)		C more groups remain
110	b	L(ret)		C n was 1
111
C n mod 4 = 2: handle two limbs, then continue with full groups if any.
112L(b10):	ld	r10, 0(r4)	C load s1 limb
113	ld	r11, 0(r5)	C load s2 limb
114	ld	r12, 8(r4)	C load s1 limb
115	addi	r4, r4, 16
116	ld	r0, 8(r5)	C load s2 limb
117	addi	r5, r5, 16
118	ADDSUBC	r30, r11, r10	C add/sub with carry
119	ADDSUBC	r31, r0, r12	C add/sub with carry
120	std	r30, 0(r3)
121	std	r31, 8(r3)
122	addi	r3, r3, 16
123	bdnz	L(go)		C more groups remain
124	b	L(ret)		C n was 2
125
C Full groups.  L(go) preloads the first group of four limb pairs; CA is
C whatever the entry code or the peeled limbs left behind.
126L(b00):	C n mod 4 = 0: nothing to peel, CA was set up at entry
127L(go):	ld	r6, 0(r4)	C load s1 limb
128	ld	r7, 0(r5)	C load s2 limb
129	ld	r8, 8(r4)	C load s1 limb
130	ld	r9, 8(r5)	C load s2 limb
131	ld	r10, 16(r4)	C load s1 limb
132	ld	r11, 16(r5)	C load s2 limb
133	ld	r12, 24(r4)	C load s1 limb
134	ld	r0, 24(r5)	C load s2 limb
135	bdz	L(end)		C that was the last group: just finish it
136
137	addi	r4, r4, 32
138	addi	r5, r5, 32
139
C Main loop.  Invariant at L(top): r6-r12 and r0 hold an unprocessed group,
C r4/r5 point at the group after it, r3 points at where results go.
C Each pass combines the held group while loading the next one.
140	ALIGN(16)
141L(top):	ADDSUBC	r28, r7, r6
142	ld	r6, 0(r4)	C load s1 limb
143	ld	r7, 0(r5)	C load s2 limb
144	ADDSUBC	r29, r9, r8
145	ld	r8, 8(r4)	C load s1 limb
146	ld	r9, 8(r5)	C load s2 limb
147	ADDSUBC	r30, r11, r10
148	ld	r10, 16(r4)	C load s1 limb
149	ld	r11, 16(r5)	C load s2 limb
150	ADDSUBC	r31, r0, r12
151	ld	r12, 24(r4)	C load s1 limb
152	ld	r0, 24(r5)	C load s2 limb
153	std	r28, 0(r3)
154	addi	r4, r4, 32
155	std	r29, 8(r3)
156	addi	r5, r5, 32
157	std	r30, 16(r3)
158	std	r31, 24(r3)
159	addi	r3, r3, 32
160	bdnz	L(top)		C decrement ctr and loop back
161
C Wind-down: combine and store the last preloaded group; no further loads.
162L(end):	ADDSUBC	r28, r7, r6
163	ADDSUBC	r29, r9, r8
164	ADDSUBC	r30, r11, r10
165	ADDSUBC	r31, r0, r12
166	std	r28, 0(r3)
167	std	r29, 8(r3)
168	std	r30, 16(r3)
169	std	r31, 24(r3)
170
C Restore the saved registers (ld leaves CA alone), then turn CA into the
C return value.
171L(ret):	ld	r31, -8(r1)
172	ld	r30, -16(r1)
173	ld	r29, -24(r1)
174	ld	r28, -32(r1)
175
176	subfe	r3, r0, r0	C r3 = CA - 1 (0 if CA set, -1 if clear); r0 value is irrelevant
177	GENRVAL			C r3 = carry-out (add) or borrow-out (sub), 0 or 1
178	blr
179EPILOGUE()
180