xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/arm/v6/sqr_basecase.asm (revision ce54336801cf28877c3414aa2fcb251dddd543a2)
1dnl  ARM v6 mpn_sqr_basecase.
2
3dnl  Contributed to the GNU project by Torbjörn Granlund.
4
5dnl  Copyright 2012, 2013, 2015 Free Software Foundation, Inc.
6
7dnl  This file is part of the GNU MP Library.
8dnl
9dnl  The GNU MP Library is free software; you can redistribute it and/or modify
10dnl  it under the terms of either:
11dnl
12dnl    * the GNU Lesser General Public License as published by the Free
13dnl      Software Foundation; either version 3 of the License, or (at your
14dnl      option) any later version.
15dnl
16dnl  or
17dnl
18dnl    * the GNU General Public License as published by the Free Software
19dnl      Foundation; either version 2 of the License, or (at your option) any
20dnl      later version.
21dnl
22dnl  or both in parallel, as here.
23dnl
24dnl  The GNU MP Library is distributed in the hope that it will be useful, but
25dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
26dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
27dnl  for more details.
28dnl
29dnl  You should have received copies of the GNU General Public License and the
30dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
31dnl  see https://www.gnu.org/licenses/.
32
33include(`../config.m4')
34
35C Code structure:
36C
37C
38C        m_2(0m4)        m_2(2m4)        m_2(1m4)        m_2(3m4)
39C           |               |               |               |
40C           |               |               |               |
41C           |               |               |               |
42C          \|/             \|/             \|/             \|/
43C              ____________                   ____________
44C             /            \                 /            \
45C            \|/            \               \|/            \
46C         am_2(3m4)       am_2(1m4)       am_2(0m4)       am_2(2m4)
47C            \            /|\                \            /|\
48C             \____________/                  \____________/
49C                       \                        /
50C                        \                      /
51C                         \                    /
52C                         cor3             cor2
53C                            \              /
54C                             \            /
55C                            sqr_diag_addlsh1
56
57C TODO
58C  * Align more labels.
59C  * Further tweak counter and updates in outer loops.  (This could save
60C    perhaps 5n cycles).
61C  * Avoid sub-with-lsl in outer loops.  We could keep n up-shifted, then
62C    initialise loop counter i with a right shift.
63C  * Try to use fewer registers.  Perhaps coalesce r10 branch target and n_saved.
64C    (This could save 2-3 cycles for n > 4.)
65C  * Optimise sqr_diag_addlsh1 loop.  The current code uses old-style carry
66C    propagation.
67C  * Stop loops earlier suppressing writes of upper-most rp[] values.
68C  * The addmul_2 loops here run well on all cores, but the mul_2 loop runs
69C    poorly, particularly on Cortex-A8.
70
71
C Register aliases used by the n > 4 code paths.
C   rp, up, n   incoming arguments in r0-r2: rp is the base register of all
C               result stores, up the base register of all operand loads,
C               and n the limb count tested by the entry dispatch.
C   v0, v1      the pair of limbs each mul_2/addmul_2 pass multiplies by.
C   u0, u1      limbs streamed from up[] inside a pass.
C   cya, cyb    high words (running carries) of the v0 and v1 product chains
C               in the inner loops.
C   i           inner loop counter.
C   n_saved     copy of the original n, held in r14 (the link register); the
C               n > 4 entry points push r14 before overwriting it.
72define(`rp',      r0)
73define(`up',      r1)
74define(`n',       r2)
75
76define(`v0',      r3)
77define(`v1',      r6)
78define(`i',       r8)
79define(`n_saved', r14)
80define(`cya',     r11)
81define(`cyb',     r12)
82define(`u0',      r7)
83define(`u1',      r9)
84
85ASM_START()
86PROLOGUE(mpn_sqr_basecase)
C Entry: dispatch on n through a table of eight branches.
C r12 = n mod 4, plus 4 when n > 4.  Slots 0-3 therefore serve n = 4, 1, 2, 3
C (straight-line code) and slots 4-7 serve n > 4, one per value of n mod 4.
C NOTE(review): n = 0 would also select slot 0; presumably callers never
C pass it -- confirm against the caller contract.
87	and	r12, n, #3
88	cmp	n, #4
89	addgt	r12, r12, #4
C In ARM state pc reads as the address of this instruction plus 8, so with
C the nop as filler an index of 0 lands on the first branch below.
90	add	pc, pc, r12, lsl #2
91	nop
92	b	L(4)
93	b	L(1)
94	b	L(2)
95	b	L(3)
96	b	L(0m4)
97	b	L(1m4)
98	b	L(2m4)
99	b	L(3m4)
100
101
C First pass for n > 4 (m_2 in the structure diagram): four entry points,
C one per value of n mod 4, all feeding one 4-way unrolled loop.
C Every entry point does the same set-up:
C   - saves r4-r11 and r14, and keeps the original n in n_saved (r14);
C   - sets i = n - 4 (loop counter) and n = n - 2;
C   - loads r10 with the address of the addmul_2 code to continue with once
C     this pass is done (pc reads as . + 8, hence the -.-8 in the offset);
C   - loads v0, v1 and one further limb from up[];
C   - forms the first product v1*v0: low word stored at [rp], high word left
C     in cya, with cyb and the two accumulators r4/r5 cleared;
C   - adjusts up and rp so that the fixed offsets at its loop entry label
C     address the right limbs.

C Entry for n mod 4 = 1; joins the loop at L(ko0).
102L(1m4):	push	{r4-r11, r14}
103	mov	n_saved, n
104	sub	i, n, #4
105	sub	n, n, #2
106	add	r10, pc, #L(am2_2m4)-.-8
107	ldm	up, {v0,v1,u0}
108	sub	up, up, #4
109	mov	cyb, #0
110	mov	r5, #0
111	umull	r4, cya, v1, v0
112	str	r4, [rp], #-12
113	mov	r4, #0
114	b	L(ko0)
115
C Entry for n mod 4 = 3; joins the loop at L(ko2).
116L(3m4):	push	{r4-r11, r14}
117	mov	n_saved, n
118	sub	i, n, #4
119	sub	n, n, #2
120	add	r10, pc, #L(am2_0m4)-.-8
121	ldm	up, {v0,v1,u0}
122	add	up, up, #4
123	mov	cyb, #0
124	mov	r5, #0
125	umull	r4, cya, v1, v0
126	str	r4, [rp], #-4
127	mov	r4, #0
128	b	L(ko2)
129
C Entry for n mod 4 = 2; joins the loop at L(ko1).
130L(2m4):	push	{r4-r11, r14}
131	mov	n_saved, n
132	sub	i, n, #4
133	sub	n, n, #2
134	add	r10, pc, #L(am2_3m4)-.-8
135	ldm	up, {v0,v1,u1}
136	mov	cyb, #0
137	mov	r4, #0
138	umull	r5, cya, v1, v0
139	str	r5, [rp], #-8
140	mov	r5, #0
141	b	L(ko1)
142
C Entry for n mod 4 = 0; falls through into the loop at L(top).
143L(0m4):	push	{r4-r11, r14}
144	mov	n_saved, n
145	sub	i, n, #4
146	sub	n, n, #2
147	add	r10, pc, #L(am2_1m4)-.-8
148	ldm	up, {v0,v1,u1}
149	mov	cyb, #0
150	mov	r4, #0
151	add	up, up, #8
152	umull	r5, cya, v1, v0
153	str	r5, [rp, #0]
154	mov	r5, #0
155
C mul_2 inner loop, four limbs per iteration.
C umaal lo, hi, x, y computes x*y + lo + hi and writes the 64-bit result
C back to hi:lo.  Each limb loaded from up[] is multiplied by v0 (carry
C chain in cya) and by v1 (carry chain in cyb).  r4 and r5 alternate as the
C word being accumulated: once the v0 product has been added the word is
C stored and its register is cleared to start a new word.
156L(top):	ldr	u0, [up, #4]
157	umaal	r4, cya, u1, v0
158	str	r4, [rp, #4]
159	mov	r4, #0
160	umaal	r5, cyb, u1, v1
161L(ko2):	ldr	u1, [up, #8]
162	umaal	r5, cya, u0, v0
163	str	r5, [rp, #8]
164	mov	r5, #0
165	umaal	r4, cyb, u0, v1
166L(ko1):	ldr	u0, [up, #12]
167	umaal	r4, cya, u1, v0
168	str	r4, [rp, #12]
169	mov	r4, #0
170	umaal	r5, cyb, u1, v1
C The last group advances both pointers by 16 bytes (pre-indexed writeback).
171L(ko0):	ldr	u1, [up, #16]!
172	umaal	r5, cya, u0, v0
173	str	r5, [rp, #16]!
174	mov	r5, #0
175	umaal	r4, cyb, u0, v1
C Loop again while i - 4 neither borrows nor reaches zero.
176	subs	i, i, #4
177	bhi	L(top)
178
C Wind-down: process the last two limbs and store the remaining words,
C ending with the two carry words.
179	umaal	r4, cya, u1, v0
180	ldr	u0, [up, #4]
181	umaal	r5, cyb, u1, v1
182	str	r4, [rp, #4]
183	umaal	r5, cya, u0, v0
184	umaal	cya, cyb, u0, v1
185	str	r5, [rp, #8]
186	str	cya, [rp, #12]
187	str	cyb, [rp, #16]
188
C Adjust up, n and rp for the following pass and jump to the addmul_2 code
C whose address was placed in r10 at entry.
189	add	up, up, #4
190	sub	n, n, #1
191	add	rp, rp, #8
192	bx	r10
193
C addmul_2 passes, first chain (am_2(0m4)/am_2(2m4) in the structure diagram).
C Two passes alternate: the L(ua2) pass starting at L(evnloop) and the
C L(ua0) pass starting at L(am2_0m4).  Each pass takes a new (v0,v1) pair
C from up[] and adds its products with the following limbs into words
C already present in rp[].  Every pass lowers n by 2.
194L(evnloop):
C i = n - 6 is this pass's loop counter.  If it is negative (signed), the
C code leaves the chain for the corner code at L(cor2).
195	subs	i, n, #6
196	sub	n, n, #2
197	blt	L(cor2)
198	ldm	up, {v0,v1,u1}
199	add	up, up, #8
200	mov	cya, #0
201	mov	cyb, #0
C Add the product v1*v0 into the word at [rp, #-4], then preload the next
C word for the loop.
202	ldr	r4, [rp, #-4]
203	umaal	r4, cya, v1, v0
204	str	r4, [rp, #-4]
205	ldr	r4, [rp, #0]
206
207	ALIGN(16)
C 4-way unrolled addmul_2 loop.  Same product pattern as the mul_2 loop,
C except that each result word is first loaded from rp[] so the products
C accumulate into it (umaal adds both of its destination registers).
208L(ua2):	ldr	r5, [rp, #4]
209	umaal	r4, cya, u1, v0
210	ldr	u0, [up, #4]
211	umaal	r5, cyb, u1, v1
212	str	r4, [rp, #0]
213	ldr	r4, [rp, #8]
214	umaal	r5, cya, u0, v0
215	ldr	u1, [up, #8]
216	umaal	r4, cyb, u0, v1
217	str	r5, [rp, #4]
218	ldr	r5, [rp, #12]
219	umaal	r4, cya, u1, v0
220	ldr	u0, [up, #12]
221	umaal	r5, cyb, u1, v1
222	str	r4, [rp, #8]
223	ldr	r4, [rp, #16]!
224	umaal	r5, cya, u0, v0
225	ldr	u1, [up, #16]!
226	umaal	r4, cyb, u0, v1
227	str	r5, [rp, #-4]
C Loop again while i - 4 does not borrow.
228	subs	i, i, #4
229	bhs	L(ua2)
230
C Wind-down: last limb times v0 and v1, then store the word and both carries.
231	umaal	r4, cya, u1, v0
232	umaal	cya, cyb, u1, v1
233	str	r4, [rp, #0]
234	str	cya, [rp, #4]
235	str	cyb, [rp, #8]
C Also entered through bx r10 after the first pass started at L(3m4).
236L(am2_0m4):
C Move rp and up back by n limbs (n lsl 2 bytes), then step rp forward 8 bytes.
237	sub	rp, rp, n, lsl #2
238	sub	up, up, n, lsl #2
239	add	rp, rp, #8
240
241	sub	i, n, #4
242	sub	n, n, #2
243	ldm	up, {v0,v1,u1}
244	mov	cya, #0
245	mov	cyb, #0
C Add v1*v0 into the word at [rp, #4], preload [rp, #8], and enter the
C unrolled loop half-way, at L(lo0).
246	ldr	r4, [rp, #4]
247	umaal	r4, cya, v1, v0
248	str	r4, [rp, #4]
249	ldr	r4, [rp, #8]
250	b	L(lo0)
251
252	ALIGN(16)
253L(ua0):	ldr	r5, [rp, #4]
254	umaal	r4, cya, u1, v0
255	ldr	u0, [up, #4]
256	umaal	r5, cyb, u1, v1
257	str	r4, [rp, #0]
258	ldr	r4, [rp, #8]
259	umaal	r5, cya, u0, v0
260	ldr	u1, [up, #8]
261	umaal	r4, cyb, u0, v1
262	str	r5, [rp, #4]
263L(lo0):	ldr	r5, [rp, #12]
264	umaal	r4, cya, u1, v0
265	ldr	u0, [up, #12]
266	umaal	r5, cyb, u1, v1
267	str	r4, [rp, #8]
268	ldr	r4, [rp, #16]!
269	umaal	r5, cya, u0, v0
270	ldr	u1, [up, #16]!
271	umaal	r4, cyb, u0, v1
272	str	r5, [rp, #-4]
273	subs	i, i, #4
274	bhs	L(ua0)
275
C Wind-down, as after L(ua2).
276	umaal	r4, cya, u1, v0
277	umaal	cya, cyb, u1, v1
278	str	r4, [rp, #0]
279	str	cya, [rp, #4]
280	str	cyb, [rp, #8]
C Also entered through bx r10 after the first pass started at L(1m4).
281L(am2_2m4):
C Move rp and up back by n limbs, step rp forward 16 bytes, and start the
C next L(evnloop) pass.
282	sub	rp, rp, n, lsl #2
283	sub	up, up, n, lsl #2
284	add	rp, rp, #16
285	b	L(evnloop)
286
287
C addmul_2 passes, second chain (am_2(3m4)/am_2(1m4) in the structure
C diagram).  Two passes alternate: the L(ua1) pass starting at L(oddloop)
C and the L(ua3) pass starting at L(am2_3m4).  Each pass takes a new
C (v0,v1) pair from up[] and adds its products with the following limbs
C into words already present in rp[].  Every pass lowers n by 2.
288L(oddloop):
289	sub	i, n, #5
290	sub	n, n, #2
291	ldm	up, {v0,v1,u0}
292	mov	cya, #0
293	mov	cyb, #0
C Add v1*v0 into the word at [rp, #0], preload [rp, #4], and enter the
C unrolled loop at L(lo1).
294	ldr	r5, [rp, #0]
295	umaal	r5, cya, v1, v0
296	str	r5, [rp, #0]
297	ldr	r5, [rp, #4]
298	add	up, up, #4
299	b	L(lo1)
300
301	ALIGN(16)
C 4-way unrolled addmul_2 loop: each word is loaded from rp[], receives
C one product with v0 (carry in cya) and one with v1 (carry in cyb) via
C umaal, and is stored back.
302L(ua1):	ldr	r5, [rp, #4]
303	umaal	r4, cya, u1, v0
304	ldr	u0, [up, #4]
305	umaal	r5, cyb, u1, v1
306	str	r4, [rp, #0]
307L(lo1):	ldr	r4, [rp, #8]
308	umaal	r5, cya, u0, v0
309	ldr	u1, [up, #8]
310	umaal	r4, cyb, u0, v1
311	str	r5, [rp, #4]
312	ldr	r5, [rp, #12]
313	umaal	r4, cya, u1, v0
314	ldr	u0, [up, #12]
315	umaal	r5, cyb, u1, v1
316	str	r4, [rp, #8]
317	ldr	r4, [rp, #16]!
318	umaal	r5, cya, u0, v0
319	ldr	u1, [up, #16]!
320	umaal	r4, cyb, u0, v1
321	str	r5, [rp, #-4]
C Loop again while i - 4 does not borrow.
322	subs	i, i, #4
323	bhs	L(ua1)
324
C Wind-down: last limb times v0 and v1, then store the word and both carries.
325	umaal	r4, cya, u1, v0
326	umaal	cya, cyb, u1, v1
327	str	r4, [rp, #0]
328	str	cya, [rp, #4]
329	str	cyb, [rp, #8]
C Also entered through bx r10 after the first pass started at L(2m4).
330L(am2_3m4):
C Move rp and up back by n limbs (n lsl 2 bytes), then step rp forward 4 bytes.
331	sub	rp, rp, n, lsl #2
332	sub	up, up, n, lsl #2
333	add	rp, rp, #4
334
C i = n - 3; when that is zero the chain ends and L(cor3) finishes the
C cross products.
335	subs	i, n, #3
336	beq	L(cor3)
337	sub	n, n, #2
338	ldm	up, {v0,v1,u0}
339	mov	cya, #0
340	mov	cyb, #0
C Add v1*v0 into the word at [rp, #8], preload [rp, #12], and enter the
C unrolled loop at its last group, L(lo3).
341	ldr	r5, [rp, #8]
342	sub	up, up, #4
343	umaal	r5, cya, v1, v0
344	str	r5, [rp, #8]
345	ldr	r5, [rp, #12]
346	b	L(lo3)
347
348	ALIGN(16)
349L(ua3):	ldr	r5, [rp, #4]
350	umaal	r4, cya, u1, v0
351	ldr	u0, [up, #4]
352	umaal	r5, cyb, u1, v1
353	str	r4, [rp, #0]
354	ldr	r4, [rp, #8]
355	umaal	r5, cya, u0, v0
356	ldr	u1, [up, #8]
357	umaal	r4, cyb, u0, v1
358	str	r5, [rp, #4]
359	ldr	r5, [rp, #12]
360	umaal	r4, cya, u1, v0
361	ldr	u0, [up, #12]
362	umaal	r5, cyb, u1, v1
363	str	r4, [rp, #8]
364L(lo3):	ldr	r4, [rp, #16]!
365	umaal	r5, cya, u0, v0
366	ldr	u1, [up, #16]!
367	umaal	r4, cyb, u0, v1
368	str	r5, [rp, #-4]
369	subs	i, i, #4
370	bhs	L(ua3)
371
C Wind-down, as after L(ua1).
372	umaal	r4, cya, u1, v0
373	umaal	cya, cyb, u1, v1
374	str	r4, [rp, #0]
375	str	cya, [rp, #4]
376	str	cyb, [rp, #8]
C Also entered through bx r10 after the first pass started at L(0m4).
377L(am2_1m4):
C Move rp and up back by n limbs, step rp forward 12 bytes, and start the
C next L(oddloop) pass.
378	sub	rp, rp, n, lsl #2
379	sub	up, up, n, lsl #2
380	add	rp, rp, #12
381	b	L(oddloop)
382
383
C Corner code: the last few cross products, done without a loop.
C
C L(cor3) is reached from L(am2_3m4) when n - 3 is zero.  With v0, v1, u0
C loaded from up[0..2] and u1 from [up, #12], it adds the products v1*v0,
C u0*v0, u0*v1, u1*v0 and u1*v1 into the words at [rp, #8] and above, then
C forms u1*u0 plus the top carry in cya/cyb for L(sqr_diag_addlsh1) to store.
384L(cor3):ldm	up, {v0,v1,u0}
385	ldr	r5, [rp, #8]
386	mov	cya, #0
387	mov	cyb, #0
388	umaal	r5, cya, v1, v0
389	str	r5, [rp, #8]
390	ldr	r5, [rp, #12]
391	ldr	r4, [rp, #16]
392	umaal	r5, cya, u0, v0
393	ldr	u1, [up, #12]
394	umaal	r4, cyb, u0, v1
395	str	r5, [rp, #12]
396	umaal	r4, cya, u1, v0
397	umaal	cya, cyb, u1, v1
398	str	r4, [rp, #16]
399	str	cya, [rp, #20]
400	str	cyb, [rp, #24]
401	add	up, up, #16
C The top carry word moves to cya so that the final umaal adds u1*u0 to it.
C The flag-setting adds doubles as the carry clear the next phase needs.
402	mov	cya, cyb
403	adds	rp, rp, #36		C clear cy
404	mov	cyb, #0
405	umaal	cya, cyb, u1, u0
406	b	L(sqr_diag_addlsh1)
407
C L(cor2) is reached from L(evnloop) when n - 6 is negative.  It loads three
C limbs (advancing up by 12 bytes) and adds v1*v0 and u0*v0 into r4/r5,
C which start from the values held in cya/cyb on arrival.
C NOTE(review): those look like the two top words just stored by the
C preceding pass, reused from registers instead of reloaded -- confirm.
C The last product u0*v1 plus carry is left in cya/cyb, and control falls
C through into L(sqr_diag_addlsh1) (the explicit branch is commented out).
408L(cor2):
409	ldm	up!, {v0,v1,u0}
410	mov	r4, cya
411	mov	r5, cyb
412	mov	cya, #0
413	umaal	r4, cya, v1, v0
414	mov	cyb, #0
415	umaal	r5, cya, u0, v0
416	strd	r4, r5, [rp, #-4]
417	umaal	cya, cyb, u0, v1
418	add	rp, rp, #16
419C	b	L(sqr_diag_addlsh1)
420
421
C Aliases for the final phase.  They reuse registers the earlier phases
C called v1 (r6), u0 (r7), i (r8) and u1 (r9).
422define(`w0',  r6)
423define(`w1',  r7)
424define(`w2',  r8)
425define(`rbx', r9)
426
C Final phase for n > 4: double the words accumulated in rp[] by the
C earlier passes and add the square of every limb of up[].
C Entered with the two top accumulated words still in cya/cyb and, per the
C commented-out cmn below, with the carry flag already clear.
427L(sqr_diag_addlsh1):
428	str	cya, [rp, #-12]
429	str	cyb, [rp, #-8]
C n = n_saved - 1 iterations.  up moves back by n_saved limbs (lsl #2 bytes)
C and rp by twice as many words (lsl #3 bytes).
430	sub	n, n_saved, #1
431	sub	up, up, n_saved, lsl #2
432	sub	rp, rp, n_saved, lsl #3
C Square the first limb: low word in w1, high word in r5.
433	ldr	r3, [up], #4
434	umull	w1, r5, r3, r3
C w2 holds the carry out of the doubling; r10 is a constant zero used to
C capture that carry with adc.
435	mov	w2, #0
436	mov	r10, #0
437C	cmn	r0, #0			C clear cy (already clear)
438	b	L(lm)
439
C Loop body, two result words per iteration.
C L(tsd) adds the pending square to the two doubled words: rbx (high word
C of the previous square plus the saved doubling carry) goes into w0 and
C r4 (low word of the newest square) into w1.
C The carry out of that addition is consumed by the first doubling adcs
C after L(lm), so no flag-setting instruction may come between them.
439L(tsd):	adds	w0, w0, rbx
440	adcs	w1, w1, r4
441	str	w0, [rp, #0]
C Each slot at [rp, #4] is read into w0 and then immediately reused for the
C finished word in w1.
442L(lm):	ldr	w0, [rp, #4]
443	str	w1, [rp, #4]
444	ldr	w1, [rp, #8]!
445	add	rbx, r5, w2
C Double w0 and w1 as one carry chain; the carry out is saved in w2.
446	adcs	w0, w0, w0
447	ldr	r3, [up], #4
448	adcs	w1, w1, w1
449	adc	w2, r10, r10
C Square the next limb: low word in r4, high word in r5.
450	umull	r4, r5, r3, r3
451	subs	n, n, #1
452	bne	L(tsd)
453
C Wind-down: add the last square, fold its high word and the remaining
C carries into w2, and store the final three words.
454	adds	w0, w0, rbx
455	adcs	w1, w1, r4
456	adc	w2, r5, w2
457	stm	rp, {w0,w1,w2}
458
C Restore r4-r11 and return by popping the saved r14 into pc.
459	pop	{r4-r11, pc}
461
462
463C Straight line code for n <= 4
464
C n = 1: store the two-word square of up[0] at rp[0..1].
C r1 and r2 (up and n) are reused as the product registers once the limb
C has been loaded.
465L(1):	ldr	r3, [up, #0]
466	umull	r1, r2, r3, r3
467	stm	rp, {r1,r2}
468	bx	r14
469
C n = 2: four-word square of the two limbs at up[0..1].
C r5 = up[0], r12 = up[1].  r4 and r5 are saved; r12 is used as scratch.
470L(2):	push	{r4-r5}
471	ldm	up, {r5,r12}
C r2:r1 = up[0]^2, r4:r3 = up[1]^2, r12:r5 = up[0]*up[1].
472	umull	r1, r2, r5, r5
473	umull	r3, r4, r12, r12
474	umull	r5, r12, r5, r12
C Double the cross product; the bit shifted out is added to the top word.
475	adds	r5, r5, r5
476	adcs	r12, r12, r12
477	adc	r4, r4, #0
C Add the doubled cross product one word up from the bottom.
478	adds	r2, r2, r5
479	adcs	r3, r3, r12
480	adc	r4, r4, #0
481	stm	rp, {r1,r2,r3,r4}
482	pop	{r4-r5}
483	bx	r14
484
C n = 3: six-word square of the three limbs at up[0..2].
C r7, r8, r9 = up[0], up[1], up[2].
485L(3):	push	{r4-r11}
486	ldm	up, {r7,r8,r9}
C Squares: r2:r1 = up[0]^2, r4:r3 = up[1]^2, r6:r5 = up[2]^2.
487	umull	r1, r2, r7, r7
488	umull	r3, r4, r8, r8
489	umull	r5, r6, r9, r9
C Sum of the cross products, built up in r10, r11, r12, r7 (low to high).
C r7 is cleared and reused as the top word after its last use as up[0].
490	umull	r10, r11, r7, r8
491	mov	r12, #0
492	umlal	r11, r12, r7, r9
493	mov	r7, #0
494	umlal	r12, r7, r8, r9
C Double the cross-product sum; the bit shifted out is added to the top word.
495	adds	r10, r10, r10
496	adcs	r11, r11, r11
497	adcs	r12, r12, r12
498	adcs	r7, r7, r7
499	adc	r6, r6, #0
C Add the doubled sum to the squares, one word up from the bottom.
500	adds	r2, r2, r10
501	adcs	r3, r3, r11
502	adcs	r4, r4, r12
503	adcs	r5, r5, r7
504	adc	r6, r6, #0
505	stm	rp, {r1,r2,r3,r4,r5,r6}
506	pop	{r4-r11}
507	bx	r14
508
C n = 4: eight-word square of the four limbs at up[0..3].
C r9..r12 = up[0..3].  r14 is pushed because it is used as scratch below;
C the return is made by popping it into pc.
509L(4):	push	{r4-r11, r14}
510	ldm	up, {r9,r10,r11,r12}
C The four squares.  Their low seven words are parked in rp[0..6]; the top
C word stays in r8.
511	umull	r1, r2, r9, r9
512	umull	r3, r4, r10, r10
513	umull	r5, r6, r11, r11
514	umull	r7, r8, r12, r12
515	stm	rp, {r1,r2,r3,r4,r5,r6,r7}
C Sum of the six cross products, built up in r1..r6 (low to high).
516	umull	r1, r2, r9, r10
517	mov	r3, #0
518	umlal	r2, r3, r9, r11
519	mov	r4, #0
520	umlal	r3, r4, r9, r12
521	mov	r5, #0
522	umlal	r3, r5, r10, r11
C umaal adds both r4 and r5 to the product, merging the two carry words.
523	umaal	r4, r5, r10, r12
524	mov	r6, #0
525	umlal	r5, r6, r11, r12
C Double the cross-product sum.  The add to rp does not set flags, so the
C carry out of the doubling reaches the adc that forms r7 from r8.
526	adds	r1, r1, r1
527	adcs	r2, r2, r2
528	adcs	r3, r3, r3
529	adcs	r4, r4, r4
530	adcs	r5, r5, r5
531	adcs	r6, r6, r6
532	add	rp, rp, #4
533	adc	r7, r8, #0
C Reload square words 1..6 (rp now points at word 1), add them to the
C doubled sum, and store result words 1..7.
534	ldm	rp, {r8,r9,r10,r11,r12,r14}
535	adds	r1, r1, r8
536	adcs	r2, r2, r9
537	adcs	r3, r3, r10
538	adcs	r4, r4, r11
539	adcs	r5, r5, r12
540	adcs	r6, r6, r14
541	adc	r7, r7, #0
542	stm	rp, {r1,r2,r3,r4,r5,r6,r7}
543	pop	{r4-r11, pc}
544EPILOGUE()
545