dnl  ARM Neon mpn_lshift and mpn_rshift.

dnl  Contributed to the GNU project by Torbjörn Granlund.

dnl  Copyright 2013 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb     cycles/limb     cycles/limb      good
C              aligned	      unaligned	      best seen	     for cpu?
C StrongARM	 -		 -
C XScale	 -		 -
C Cortex-A7	 ?		 ?
C Cortex-A8	 ?		 ?
C Cortex-A9	 3		 3				Y
C Cortex-A15	 1.5		 1.5				Y


C We read 64 bits at a time at 32-bit aligned addresses, and except for the
C first and last store, we write using 64-bit aligned addresses.  All shifting
C is done on 64-bit words in 'extension' registers.
C
C It should be possible to also read at 64-bit aligned addresses, by adjusting
C the shift count for unaligned operands.  This is not done, since it does not
C seem to matter for A9 or A15.
C
C This will not work in big-endian mode.
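C
C For reference, a minimal C sketch of the limb-level semantics implemented
C here, assuming 32-bit limbs and 1 <= cnt <= 31 (ref_lshift is a
C hypothetical name, illustrative only, not part of the build):
C
C	#include <stdint.h>
C	#include <stddef.h>
C
C	uint32_t
C	ref_lshift (uint32_t *rp, const uint32_t *ap, size_t n, unsigned cnt)
C	{
C	  uint32_t retval = ap[n - 1] >> (32 - cnt);	/* bits shifted out */
C	  for (size_t i = n - 1; i > 0; i--)
C	    rp[i] = (ap[i] << cnt) | (ap[i - 1] >> (32 - cnt));
C	  rp[0] = ap[0] << cnt;
C	  return retval;
C	}
C
C mpn_rshift mirrors this, walking from low to high limbs and returning the
C bits shifted out at the low end.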

C TODO
C  * Try using 128-bit operations.  Note that Neon lacks pure 128-bit shifts,
C    which might make it tricky.
C  * Clean up and simplify.
C  * Consider sharing most of the code for lshift and rshift, since the feed-in
C    code, the loop, and most of the wind-down code are identical.
C  * Replace the basecase code with code using 'extension' registers.
C  * Optimise.  It is not clear that this loop insn permutation is optimal for
C    either A9 or A15.

C INPUT PARAMETERS
define(`rp',  `r0')
define(`ap',  `r1')
define(`n',   `r2')
define(`cnt', `r3')

ifdef(`OPERATION_lshift',`
	define(`IFLSH', `$1')
	define(`IFRSH', `')
	define(`X',`0')
	define(`Y',`1')
	define(`func',`mpn_lshift')
')
ifdef(`OPERATION_rshift',`
	define(`IFLSH', `')
	define(`IFRSH', `$1')
	define(`X',`1')
	define(`Y',`0')
	define(`func',`mpn_rshift')
')

MULFUNC_PROLOGUE(mpn_lshift mpn_rshift)

ASM_START(neon)
	TEXT
	ALIGN(64)
PROLOGUE(func)
IFLSH(`	mov	r12, n, lsl #2	')
IFLSH(`	add	rp, rp, r12	')
IFLSH(`	add	ap, ap, r12	')

	cmp	n, #4			C SIMD code n limit
	ble	L(base)

ifdef(`OPERATION_lshift',`
	vdup.32	d6, r3			C left shift count is positive
	sub	r3, r3, #64		C right shift count is negative
	vdup.32	d7, r3
	mov	r12, #-8')		C lshift pointer update offset
ifdef(`OPERATION_rshift',`
	rsb	r3, r3, #0		C right shift count is negative
	vdup.32	d6, r3
	add	r3, r3, #64		C left shift count is positive
	vdup.32	d7, r3
	mov	r12, #8')		C rshift pointer update offset
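
C Note that Neon vshl shifts left for positive counts in the count register
C and right for negative ones, so d6 and d7 always hold complementary counts
C (e.g. for lshift with cnt = 10, d6 = 10 and d7 = -54).  Each output
C doubleword is then formed by OR-ing the two vshl results of adjacent
C source doublewords.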

IFLSH(`	sub	ap, ap, #8	')
	vld1.32	{d19}, [ap], r12	C load initial 2 limbs
	vshl.u64 d18, d19, d7		C retval

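C If rp is only 32-bit aligned, peel off one limb first so that all later
C vst1 :64 stores land on 64-bit aligned addresses.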
	tst	rp, #4			C is rp 64-bit aligned already?
	beq	L(rp_aligned)		C yes, skip
IFLSH(`	add	ap, ap, #4	')	C move back ap pointer
IFRSH(`	sub	ap, ap, #4	')	C move back ap pointer
	vshl.u64 d4, d19, d6
	sub	n, n, #1		C first limb handled
IFLSH(`	sub	 rp, rp, #4	')
	vst1.32	 {d4[Y]}, [rp]IFRSH(!)	C store first limb, rp gets aligned
	vld1.32	 {d19}, [ap], r12	C load ap[1] and ap[2]

L(rp_aligned):
IFLSH(`	sub	rp, rp, #8	')
	subs	n, n, #6
	blt	L(two_or_three_more)
	tst	n, #2
	beq	L(2)

L(1):	vld1.32	 {d17}, [ap], r12
	vshl.u64 d5, d19, d6
	vld1.32	 {d16}, [ap], r12
	vshl.u64 d0, d17, d7
	vshl.u64 d4, d17, d6
	sub	n, n, #2
	b	 L(mid)

L(2):	vld1.32	 {d16}, [ap], r12
	vshl.u64 d4, d19, d6
	vld1.32	 {d17}, [ap], r12
	vshl.u64 d1, d16, d7
	vshl.u64 d5, d16, d6
	subs	n, n, #4
	blt	L(end)

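C Main loop: software pipelined and two-way unrolled.  Each iteration loads
C two doublewords, produces four vshl results, and stores two combined
C doublewords, i.e. 4 limbs per iteration.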
L(top):	vld1.32	 {d16}, [ap], r12
	vorr	 d2, d4, d1
	vshl.u64 d0, d17, d7
	vshl.u64 d4, d17, d6
	vst1.32	 {d2}, [rp:64], r12
L(mid):	vld1.32	 {d17}, [ap], r12
	vorr	 d3, d5, d0
	vshl.u64 d1, d16, d7
	vshl.u64 d5, d16, d6
	vst1.32	 {d3}, [rp:64], r12
	subs	n, n, #4
	bge	L(top)

L(end):	tst	 n, #1
	beq	 L(evn)

	vorr	 d2, d4, d1
	vst1.32	 {d2}, [rp:64], r12
	b	 L(cj1)

L(evn):	vorr	 d2, d4, d1
	vshl.u64 d0, d17, d7
	vshl.u64 d16, d17, d6
	vst1.32	 {d2}, [rp:64], r12
	vorr	 d2, d5, d0
	b	 L(cj2)

C Load last 2 - 3 limbs, store last 4 - 5 limbs
L(two_or_three_more):
	tst	n, #1
	beq	L(l2)

L(l3):	vshl.u64 d5, d19, d6
	vld1.32	 {d17}, [ap], r12
L(cj1):	veor	 d16, d16, d16
IFLSH(`	add	 ap, ap, #4	')
	vld1.32	 {d16[Y]}, [ap], r12
	vshl.u64 d0, d17, d7
	vshl.u64 d4, d17, d6
	vorr	 d3, d5, d0
	vshl.u64 d1, d16, d7
	vshl.u64 d5, d16, d6
	vst1.32	 {d3}, [rp:64], r12
	vorr	 d2, d4, d1
	vst1.32	 {d2}, [rp:64], r12
IFLSH(`	add	 rp, rp, #4	')
	vst1.32	 {d5[Y]}, [rp]
	vmov.32	 r0, d18[X]
	bx	lr

L(l2):	vld1.32	 {d16}, [ap], r12
	vshl.u64 d4, d19, d6
	vshl.u64 d1, d16, d7
	vshl.u64 d16, d16, d6
	vorr	 d2, d4, d1
L(cj2):	vst1.32	 {d2}, [rp:64], r12
	vst1.32	 {d16}, [rp]
	vmov.32	 r0, d18[X]
	bx	lr

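C Basecase: scalar ARM code for n <= 4, combining adjacent limbs with plain
C shifts and orr, one 32-bit limb per step.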
define(`tnc', `r12')
L(base):
	push	{r4, r6, r7, r8}
ifdef(`OPERATION_lshift',`
	ldr	r4, [ap, #-4]!
	rsb	tnc, cnt, #32

	mov	r7, r4, lsl cnt
	tst	n, #1
	beq	L(ev)			C n even

L(od):	subs	n, n, #2
	bcc	L(ed1)			C n = 1
	ldr	r8, [ap, #-4]!
	b	L(md)			C n = 3

L(ev):	ldr	r6, [ap, #-4]!
	subs	n, n, #2
	beq	L(ed)			C n = 2
					C n = 4
L(tp):	ldr	r8, [ap, #-4]!
	orr	r7, r7, r6, lsr tnc
	str	r7, [rp, #-4]!
	mov	r7, r6, lsl cnt
L(md):	ldr	r6, [ap, #-4]!
	orr	r7, r7, r8, lsr tnc
	str	r7, [rp, #-4]!
	mov	r7, r8, lsl cnt

L(ed):	orr	r7, r7, r6, lsr tnc
	str	r7, [rp, #-4]!
	mov	r7, r6, lsl cnt
L(ed1):	str	r7, [rp, #-4]
	mov	r0, r4, lsr tnc
')
ifdef(`OPERATION_rshift',`
	ldr	r4, [ap]
	rsb	tnc, cnt, #32

	mov	r7, r4, lsr cnt
	tst	n, #1
	beq	L(ev)			C n even

L(od):	subs	n, n, #2
	bcc	L(ed1)			C n = 1
	ldr	r8, [ap, #4]!
	b	L(md)			C n = 3

L(ev):	ldr	r6, [ap, #4]!
	subs	n, n, #2
	beq	L(ed)			C n = 2
					C n = 4

L(tp):	ldr	r8, [ap, #4]!
	orr	r7, r7, r6, lsl tnc
	str	r7, [rp], #4
	mov	r7, r6, lsr cnt
L(md):	ldr	r6, [ap, #4]!
	orr	r7, r7, r8, lsl tnc
	str	r7, [rp], #4
	mov	r7, r8, lsr cnt

L(ed):	orr	r7, r7, r6, lsl tnc
	str	r7, [rp], #4
	mov	r7, r6, lsr cnt
L(ed1):	str	r7, [rp], #4
	mov	r0, r4, lsl tnc
')
	pop	{r4, r6, r7, r8}
	bx	r14
EPILOGUE()