xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/powerpc64/mode64/aors_n.asm (revision a45db23f655e22f0c2354600d3b3c2cb98abf2dc)
1dnl  PowerPC-64 mpn_add_n/mpn_sub_n -- mpn addition and subtraction.
2
3dnl  Copyright 1999-2001, 2003-2005, 2007, 2011 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of either:
9dnl
10dnl    * the GNU Lesser General Public License as published by the Free
11dnl      Software Foundation; either version 3 of the License, or (at your
12dnl      option) any later version.
13dnl
14dnl  or
15dnl
16dnl    * the GNU General Public License as published by the Free Software
17dnl      Foundation; either version 2 of the License, or (at your option) any
18dnl      later version.
19dnl
20dnl  or both in parallel, as here.
21dnl
22dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25dnl  for more details.
26dnl
27dnl  You should have received copies of the GNU General Public License and the
28dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29dnl  see https://www.gnu.org/licenses/.
30
31include(`../config.m4')
32
33C                   cycles/limb
34C POWER3/PPC630          1.5
35C POWER4/PPC970          2
36C POWER5                 2
37C POWER6                 2.63
38C POWER7               2.25-2.87
39
40C This code is a little bit slower for POWER3/PPC630 than the simple code used
41C previously, but it is much faster for POWER4/PPC970.  The reason for the
42C POWER3/PPC630 slowdown can be attributed to the saving and restoring of 4
43C registers.
44
45C INPUT PARAMETERS
46C rp	r3
47C up	r4
48C vp	r5
49C n	r6
50
C Operation selection.  The build is expected to define one of OPERATION_add_n
C or OPERATION_sub_n (NOTE(review): confirm -- if neither is defined, none of
C the macros below exist and the code further down cannot expand).
C
C   ADDSUBC   limb operation that consumes and produces the carry bit CA
C             (adde for add, subfe for subtract)
C   ADDSUB    variant that only produces CA (addc / subfc); it is defined
C             here but not referenced by the code in this file as shown
C   func      name of the plain entry point
C   func_nc   name of the entry point that takes a carry/borrow-in
C   GENRVAL   final fix-up applied to r3 just before returning
C   SETCBR    sets CA from the register given as argument (r0 is scratch)
C   CLRCB     sets CA to the no-carry / no-borrow state (r0 is scratch)
C
C Add variant: CA = 1 means a carry is pending.
C   SETCBR: reg + (-1) produces a carry out exactly when reg is non-zero.
C   CLRCB:  adding 0 never produces a carry out, so CA = 0.
C   GENRVAL: adds 1 to r3.
51ifdef(`OPERATION_add_n',`
52  define(ADDSUBC,	adde)
53  define(ADDSUB,	addc)
54  define(func,		mpn_add_n)
55  define(func_nc,	mpn_add_nc)
56  define(GENRVAL,	`addi	r3, r3, 1')
57  define(SETCBR,	`addic	r0, $1, -1')
58  define(CLRCB,		`addic	r0, r0, 0')
59')
C Subtract variant: subfe and subfc treat CA = 1 as no borrow pending.
C   SETCBR: 0 - reg leaves CA = 1 only when reg is zero, i.e. no borrow-in.
C   CLRCB:  r1 + (-1) produces a carry out whenever r1 is non-zero, giving
C           CA = 1.  r1 is assumed to be non-zero here (it is the base
C           register of the register save area used by func).
C   GENRVAL: negates r3.
60ifdef(`OPERATION_sub_n',`
61  define(ADDSUBC,	subfe)
62  define(ADDSUB,	subfc)
63  define(func,		mpn_sub_n)
64  define(func_nc,	mpn_sub_nc)
65  define(GENRVAL,	`neg	r3, r3')
66  define(SETCBR,	`subfic	r0, $1, 0')
67  define(CLRCB,		`addic	r0, r1, -1')
68')
69
C The list on the next line names the four entry points this source can
C produce: the add and subtract variants, each with and without carry-in.
C NOTE(review): the macro itself is defined outside this file; presumably it
C is read by the build system to learn which functions this file provides.
70MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
71
C Start of the assembly output proper (macro defined outside this file).
72ASM_START()
C func_nc: variant with carry/borrow-in.
C
C In addition to rp, up, vp and n (r3-r6, see INPUT PARAMETERS), the incoming
C carry or borrow is taken from r7.  SETCBR converts it into the CA bit
C (clobbering r0), after which control is transferred to L(ent) inside the
C body of func that follows, so both entry points share one implementation.
C This stub has no return of its own; the shared code returns to the caller.
73PROLOGUE(func_nc)
74	SETCBR(r7)
75	b	L(ent)
76EPILOGUE()
77
C func: rp[i] = up[i] + vp[i] (add variant) or up[i] - vp[i] (subtract
C variant) for i = 0 .. n-1, with the CA bit chaining carry or borrow from one
C limb to the next.  r3 = rp, r4 = up, r5 = vp, r6 = n on entry.
C
C This entry seeds CA with CLRCB; func_nc seeds it from its extra argument
C and then joins at L(ent).
C
C Outline:
C   1. Save r28-r31 at negative offsets from r1 (r1 itself is not adjusted).
C   2. Handle the n mod 4 leading limbs in L(b01), L(b10) or L(b11).
C   3. Process the remaining limbs four at a time in L(go), L(top), L(end).
C   4. Restore r28-r31 and convert the final CA into the return value in r3.
C
C Every ADDSUBC is written as (destination, vp limb, up limb); with subfe
C that operand order computes up - vp.
C
C No instruction between the CA seeding and the final subfe other than
C ADDSUBC writes CA, which is what keeps the carry chain intact.
C
C NOTE(review): the saves below r1 rely on that area being usable without
C moving the stack pointer -- presumably guaranteed by the ABI; confirm.
C NOTE(review): n = 0 is not special-cased.  ctr would be loaded with 0 and
C the bdz at L(go) would decrement it to a non-zero value instead of exiting.
C Presumably callers guarantee n >= 1 -- confirm.
78PROLOGUE(func)
79	CLRCB
80L(ent):	std	r31, -8(r1)
81	std	r30, -16(r1)
82	std	r29, -24(r1)
83	std	r28, -32(r1)
84
C Split n into (n mod 4) leading limbs plus groups of four.  The loop count
C (n + 3) / 4 covers the leading partial group, if any, plus each full group.
C cr0 records whether n mod 4 is zero; cr6 compares n mod 4 against 2.
85	rldicl.	r0, r6, 0,62	C r0 = n & 3, set cr0
86	cmpdi	cr6, r0, 2
87	addi	r6, r6, 3	C compute count...
88	srdi	r6, r6, 2	C ...for ctr
89	mtctr	r6		C copy count into ctr
90	beq	cr0, L(b00)
91	blt	cr6, L(b01)
92	beq	cr6, L(b10)
93
C n mod 4 == 3: process three leading limbs.  After the stores, bdnz
C continues with L(go) if groups of four remain, otherwise the function is
C finished.  r0 is reused here as a data register for a vp limb.
94L(b11):	ld	r8, 0(r4)	C load s1 limb
95	ld	r9, 0(r5)	C load s2 limb
96	ld	r10, 8(r4)	C load s1 limb
97	ld	r11, 8(r5)	C load s2 limb
98	ld	r12, 16(r4)	C load s1 limb
99	addi	r4, r4, 24
100	ld	r0, 16(r5)	C load s2 limb
101	addi	r5, r5, 24
102	ADDSUBC	r29, r9, r8
103	ADDSUBC	r30, r11, r10
104	ADDSUBC	r31, r0, r12
105	std	r29, 0(r3)
106	std	r30, 8(r3)
107	std	r31, 16(r3)
108	addi	r3, r3, 24
109	bdnz	L(go)
110	b	L(ret)
111
C n mod 4 == 1: process one leading limb, then continue as above.
112L(b01):	ld	r12, 0(r4)	C load s1 limb
113	addi	r4, r4, 8
114	ld	r0, 0(r5)	C load s2 limb
115	addi	r5, r5, 8
116	ADDSUBC	r31, r0, r12	C add
117	std	r31, 0(r3)
118	addi	r3, r3, 8
119	bdnz	L(go)
120	b	L(ret)
121
C n mod 4 == 2: process two leading limbs, then continue as above.
122L(b10):	ld	r10, 0(r4)	C load s1 limb
123	ld	r11, 0(r5)	C load s2 limb
124	ld	r12, 8(r4)	C load s1 limb
125	addi	r4, r4, 16
126	ld	r0, 8(r5)	C load s2 limb
127	addi	r5, r5, 16
128	ADDSUBC	r30, r11, r10	C add
129	ADDSUBC	r31, r0, r12	C add
130	std	r30, 0(r3)
131	std	r31, 8(r3)
132	addi	r3, r3, 16
133	bdnz	L(go)
134	b	L(ret)
135
C n mod 4 == 0 enters here.  The L(b00) line holds no instruction, so L(b00)
C and L(go) name the same address.
C NOTE(review): INITCY is not defined anywhere in this file as shown; the
C comment on the next line looks like a leftover.  CA has already been set
C by CLRCB or SETCBR at this point.
136L(b00):	C INITCY		C clear/set cy
C Preload the four limb pairs of the next group.  If this is the last group,
C bdz jumps straight to L(end); r4 and r5 are advanced only when another
C group will be loaded.
137L(go):	ld	r6, 0(r4)	C load s1 limb
138	ld	r7, 0(r5)	C load s2 limb
139	ld	r8, 8(r4)	C load s1 limb
140	ld	r9, 8(r5)	C load s2 limb
141	ld	r10, 16(r4)	C load s1 limb
142	ld	r11, 16(r5)	C load s2 limb
143	ld	r12, 24(r4)	C load s1 limb
144	ld	r0, 24(r5)	C load s2 limb
145	bdz	L(end)
146
147	addi	r4, r4, 32
148	addi	r5, r5, 32
149
C Main loop.  Each iteration combines the four limb pairs loaded previously
C while loading the four pairs for the next pass; every input register is
C reloaded only after the ADDSUBC that consumes it.  Results are held in
C r28-r31 and stored after the loads have been issued.
150	ALIGN(16)
151L(top):	ADDSUBC	r28, r7, r6
152	ld	r6, 0(r4)	C load s1 limb
153	ld	r7, 0(r5)	C load s2 limb
154	ADDSUBC	r29, r9, r8
155	ld	r8, 8(r4)	C load s1 limb
156	ld	r9, 8(r5)	C load s2 limb
157	ADDSUBC	r30, r11, r10
158	ld	r10, 16(r4)	C load s1 limb
159	ld	r11, 16(r5)	C load s2 limb
160	ADDSUBC	r31, r0, r12
161	ld	r12, 24(r4)	C load s1 limb
162	ld	r0, 24(r5)	C load s2 limb
163	std	r28, 0(r3)
164	addi	r4, r4, 32
165	std	r29, 8(r3)
166	addi	r5, r5, 32
167	std	r30, 16(r3)
168	std	r31, 24(r3)
169	addi	r3, r3, 32
170	bdnz	L(top)		C decrement ctr and loop back
171
C Final group of four: the operands are already in registers, nothing more
C is loaded.
172L(end):	ADDSUBC	r28, r7, r6
173	ADDSUBC	r29, r9, r8
174	ADDSUBC	r30, r11, r10
175	ADDSUBC	r31, r0, r12
176	std	r28, 0(r3)
177	std	r29, 8(r3)
178	std	r30, 16(r3)
179	std	r31, 24(r3)
180
C Restore the saved registers.  The loads leave CA untouched, so the carry
C out of the last ADDSUBC is still available to subfe below.
181L(ret):	ld	r31, -8(r1)
182	ld	r30, -16(r1)
183	ld	r29, -24(r1)
184	ld	r28, -32(r1)
185
C subfe with identical source registers yields CA - 1 whatever r0 holds:
C 0 when CA = 1 and -1 when CA = 0.  GENRVAL then turns this into the 0/1
C return value: adding 1 gives the carry-out for add, negating gives the
C borrow-out for subtract.
186	subfe	r3, r0, r0	C -cy
187	GENRVAL
188	blr
189EPILOGUE()
190