xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/arm/aorslsh1_n.asm (revision 41f3ac3e09f0c1c4d8b911b4c8a1d6450bd14f46)
1dnl  ARM mpn_addlsh1_n and mpn_sublsh1_n
2
3dnl  Contributed to the GNU project by Torbjörn Granlund.
4
5dnl  Copyright 2012 Free Software Foundation, Inc.
6
7dnl  This file is part of the GNU MP Library.
8dnl
9dnl  The GNU MP Library is free software; you can redistribute it and/or modify
10dnl  it under the terms of either:
11dnl
12dnl    * the GNU Lesser General Public License as published by the Free
13dnl      Software Foundation; either version 3 of the License, or (at your
14dnl      option) any later version.
15dnl
16dnl  or
17dnl
18dnl    * the GNU General Public License as published by the Free Software
19dnl      Foundation; either version 2 of the License, or (at your option) any
20dnl      later version.
21dnl
22dnl  or both in parallel, as here.
23dnl
24dnl  The GNU MP Library is distributed in the hope that it will be useful, but
25dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
26dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
27dnl  for more details.
28dnl
29dnl  You should have received copies of the GNU General Public License and the
30dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
31dnl  see https://www.gnu.org/licenses/.
32
33include(`../config.m4')
34
35C	      addlsh1_n       sublsh1_n
36C	     cycles/limb     cycles/limb
37C StrongARM	 ?		 ?
38C XScale	 ?		 ?
39C Cortex-A7	 ?		 ?
40C Cortex-A8	 ?		 ?
41C Cortex-A9	 3.12		 3.7
42C Cortex-A15	 ?		 ?
43
44C TODO
45C  * The addlsh1_n code runs well, but is only barely faster than mpn_addmul_1.
46C    The sublsh1_n code could surely be tweaked; its REVCY slows things down
47C    considerably.  If two insns are really needed, it might help to separate
48C    them for better micro-parallelism.
49
50define(`rp', `r0')	C r0: pointer through which every result limb is stored
51define(`up', `r1')	C r1: pointer to the limbs that are added to / subtracted from
52define(`vp', `r2')	C r2: pointer to the limbs that are doubled before being combined
53define(`n',  `r3')	C r3: limb counter, stepped down by 3 per block
54
C Per-operation building blocks.  Each ifdef body is only evaluated when its
C OPERATION_* symbol is defined; presumably the build defines exactly one of
C them per object file (confirm against the build machinery).
C
C SAVECY/RESTCY move the C flag into a register and back; REVCY inverts the
C C flag; INICYR seeds a carry-save register; RETVAL forms the return value.
55ifdef(`OPERATION_addlsh1_n', `
56  define(`ADDSUB',	adds)	C flag-setting add; not referenced by the function below
57  define(`ADDSUBC',	adcs)	C limb combine step: add with carry-in, flags updated
58  define(`SETCY',	`cmp	$1, #1')	C not referenced by the function below
59  define(`RETVAL',	`adc	r0, $1, #2')	C r0 = arg + 2 + C; arg is -1 or -2 from SAVECY, so r0 = saved carry + C
60  define(`SAVECY',	`sbc	$1, $2, #0')	C arg1 = arg2 - 1 + C; with arg2 = -1 this is -1 if C set, -2 if clear
61  define(`RESTCY',	`cmn	$1, #1')	C arg + 1 carries out only for -1, so C comes back exactly as saved
62  define(`REVCY',	`')	C no flag inversion needed for addition
63  define(`INICYR',	`mov	$1, #0')	C 0 makes RESTCY yield C clear, i.e. no carry-in
64  define(`r10r11',	`r11')	C last register of the push/pop range; r11 holds the -1 used by SAVECY
65  define(`func',	mpn_addlsh1_n)
66  define(`func_nc',	mpn_addlsh1_nc)')	C func_nc is not referenced by the function below
67ifdef(`OPERATION_sublsh1_n', `
68  define(`ADDSUB',	subs)	C flag-setting subtract; not referenced by the function below
69  define(`ADDSUBC',	sbcs)	C limb combine step: subtract with borrow; ARM keeps C = NOT borrow
70  define(`SETCY',	`rsbs	$1, $1, #0')	C not referenced by the function below
71  define(`RETVAL',	`adc	r0, $1, #1')	C r0 = arg + 1 + C; arg is 0 or -1 from SAVECY
72  define(`SAVECY',	`sbc	$1, $1, $1')	C arg1 = 0 if C set, -1 if C clear; the second argument is ignored
73  define(`RESTCY',	`cmn	$1, #1')	C C set only for -1, so the flag comes back inverted: a doubling carry turns into a borrow for sbcs
74  define(`REVCY',	`sbc	$1, $1, $1
75			cmn	$1, #1')	C invert C in place (same save/restore pair); overwrites the argument register
76  define(`INICYR',	`mvn	$1, #0')	C -1 makes RESTCY yield C set, i.e. no borrow
77  define(`r10r11',	`r10')	C last register of the push/pop range; r11 is not used by this variant
78  define(`func',	mpn_sublsh1_n)
79  define(`func_nc',	mpn_sublsh1_nc)')	C func_nc is not referenced by the function below
80
C func -- expands to mpn_addlsh1_n or mpn_sublsh1_n.
C
C In:   rp (r0) result pointer, up (r1) and vp (r2) source pointers,
C       n (r3) limb count.
C Does: for each limb, loads one limb from up and one from vp, doubles the vp
C       limb, combines the two with ADDSUBC (adcs or sbcs) and stores the
C       result through rp.
C Out:  r0 = final doubling carry + final add carry (or subtract borrow).
C
C NOTE(review): n = 0 falls straight through to L(rt0) with the INICYR seed
C still in r14 (the add variant would then return 2); presumably callers
C guarantee n >= 1 -- confirm.
C
C Register roles:
C   r4-r6     limbs loaded from up, overwritten with the results
C   r8-r10    limbs loaded from vp, doubled in place by adcs x, x, x
C   r11       constant -1 consumed by SAVECY (add variant only)
C   r12, r14  alternating carry-save registers
C   r7        scratch for swapping r12 and r14
C
C Carry scheme: two carry chains run through the limbs, one from doubling
C (adcs x, x, x) and one from ADDSUBC, but there is only one C flag.  They are
C crossed over: the ADDSUBC carry stays in the flag and becomes bit 0 of the
C next doubled block (that bit is otherwise zero), while the doubling
C carry-out is parked in r12 or r14 by SAVECY and brought back by RESTCY as
C the carry-in of the next ADDSUBC.  Both carries have the same weight, so the
C total is unchanged.  In the subtract variant REVCY and the inverting
C SAVECY/RESTCY pair translate between carry and ARM borrow (C = NOT borrow).
81MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_sublsh1_n)
82
83ASM_START()
84PROLOGUE(func)
85	push	{r4-r10r11, r14}	C save the registers used below; the range ends at r11 (add) or r10 (sub)
86
87ifdef(`OPERATION_addlsh1_n', `
88	mvn	r11, #0	C r11 = -1, the value SAVECY subtracts from
89')
90	INICYR(	r14)	C seed r14 so that the first RESTCY(r14) means no carry-in / no borrow
91	subs	n, n, #3	C n = limbs remaining after one 3-limb block
92	blt	L(le2)			C carry clear on branch path
93
94	cmn	r0, #0			C clear carry
95	ldmia	vp!, {r8, r9, r10}	C first three v limbs
96	b	L(mid)	C enter the loop at the doubling half
97
C Loop body, unrolled twice so that r14 and r12 swap roles each half:
C one register is restored from while the other receives the new save.
98L(top):	RESTCY(	r14)	C C := doubling carry parked in r14 (or the INICYR seed)
99	ADDSUBC	r4, r4, r8	C combine u limbs with the doubled v limbs
100	ADDSUBC	r5, r5, r9
101	ADDSUBC	r6, r6, r10
102	ldmia	vp!, {r8, r9, r10}	C next three v limbs
103	stmia	rp!, {r4, r5, r6}	C store the finished block
104	REVCY(r14)	C sub only: C := borrow; r14 is free again after RESTCY
105	adcs	r8, r8, r8	C double the v limbs; carry-in is the carry or borrow just produced
106	adcs	r9, r9, r9
107	adcs	r10, r10, r10
108	ldmia	up!, {r4, r5, r6}	C u limbs for the block just doubled
109	SAVECY(	r14, r11)	C park the doubling carry-out in r14
110	subs	n, n, #3
111	blt	L(exi)	C no further full block; r12 and r14 are already in the order L(exi) expects
112	RESTCY(	r12)	C second half: same steps with r12 and r14 exchanged
113	ADDSUBC	r4, r4, r8
114	ADDSUBC	r5, r5, r9
115	ADDSUBC	r6, r6, r10
116	ldmia	vp!, {r8, r9, r10}
117	stmia	rp!, {r4, r5, r6}
118	REVCY(r12)
119L(mid):	adcs	r8, r8, r8	C double the v limbs (C is clear on first entry)
120	adcs	r9, r9, r9
121	adcs	r10, r10, r10
122	ldmia	up!, {r4, r5, r6}
123	SAVECY(	r12, r11)	C park the doubling carry-out in r12
124	subs	n, n, #3
125	bge	L(top)	C at least three more limbs remain
126
C Leaving from the second half: L(exi) restores from r12 and expects the
C newest doubling carry in r14, so exchange the two (mov leaves flags alone).
127	mov	r7, r12			C swap alternating...
128	mov	r12, r14		C ...carry-save...
129	mov	r14, r7			C ...registers
130
131L(exi):	RESTCY(	r12)	C finish the block that has been doubled but not yet combined
132	ADDSUBC	r4, r4, r8
133	ADDSUBC	r5, r5, r9
134	ADDSUBC	r6, r6, r10
135	stmia	rp!, {r4, r5, r6}
136
137	REVCY(r12)	C sub only: C := borrow, ready for the tail doubling or RETVAL
C Tail: 0, 1 or 2 limbs remain.  The C flag set up above must survive both tst
C instructions; the code relies on tst with these immediates not changing C.
138L(le2):	tst	n, #1			C n = {-1,-2,-3} map to [2], [1], [0]
139	beq	L(e1)	C bit 0 clear: n = -2, one limb left
140
141L(e02):	tst	n, #2
142	beq	L(rt0)	C bit 1 clear: n = -3, nothing left; r14 and C already hold the return parts
143	ldm	vp, {r8, r9}	C two limbs left
144	adcs	r8, r8, r8
145	adcs	r9, r9, r9
146	ldm	up, {r4, r5}
147	SAVECY(	r12, r11)	C final doubling carry-out goes to r12
148	RESTCY(	r14)	C bring back the previous doubling carry
149	ADDSUBC	r4, r4, r8
150	ADDSUBC	r5, r5, r9
151	stm	rp, {r4, r5}
152	b	L(rt1)
153
154L(e1):	ldr	r8, [vp]	C one limb left
155	adcs	r8, r8, r8
156	ldr	r4, [up]
157	SAVECY(	r12, r11)	C final doubling carry-out goes to r12
158	RESTCY(	r14)	C bring back the previous doubling carry
159	ADDSUBC	r4, r4, r8
160	str	r4, [rp]
161
162L(rt1):	mov	r14, r12	C RETVAL reads the final doubling carry from r14; mov keeps the flags
163	REVCY(r12)	C sub only: C := borrow
164L(rt0):	RETVAL(	r14)	C r0 = doubling carry + add carry or borrow
165	pop	{r4-r10r11, r14}	C restore the registers saved on entry, including r14
166	return	r14	C return macro is defined outside this file; presumably branches to the address in r14 -- confirm
167EPILOGUE()
168