xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/alpha/com.asm (revision 32d1c65c71fbdb65a012e8392a62a757dd6853e9)
1dnl  Alpha mpn_com -- mpn one's complement.
2
3dnl  Copyright 2003 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of either:
9dnl
10dnl    * the GNU Lesser General Public License as published by the Free
11dnl      Software Foundation; either version 3 of the License, or (at your
12dnl      option) any later version.
13dnl
14dnl  or
15dnl
16dnl    * the GNU General Public License as published by the Free Software
17dnl      Foundation; either version 2 of the License, or (at your option) any
18dnl      later version.
19dnl
20dnl  or both in parallel, as here.
21dnl
22dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25dnl  for more details.
26dnl
27dnl  You should have received copies of the GNU General Public License and the
28dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29dnl  see https://www.gnu.org/licenses/.
30
31include(`../config.m4')
32
33
34C      cycles/limb
35C EV4:    4.75
36C EV5:    2.0
37C EV6:    1.5
38
39
40C void mpn_com (mp_ptr dst, mp_srcptr src, mp_size_t size);
41C
42C For ev5 the main loop is 7 cycles plus 1 taken branch bubble, for a total
43C 2.0 c/l.  In general, a pattern like this unrolled to N limbs per loop
44C will be 1.5+2/N c/l.
45C
46C 2 cycles of loop control are unavoidable, for pointer updates and the
47C taken branch bubble, but also since ldq cannot issue two cycles after stq
48C (and with a run of stqs that means neither of the two cycles at the end of
49C the loop).
50C
51C The fbeq is forced into the second cycle of the loop using unops, since
52C the first time through it must wait for the cvtqt result.  Once that
53C result is ready (a 1 cycle stall) then both the branch and following loads
54C can issue together.
55C
56C The main loop handles an odd count of limbs, being two limbs loaded before
57C each size test, plus one pipelined around from the previous iteration (or
58C set up in the entry sequence).
59C
60C An even number of limbs is handled by an explicit dst[0]=~src[0] in the
61C entry sequence, and an increment of the pointers.  For an odd size there's
62C no increment and the first store in the loop (r24) is a repeat of dst[0].
63C
64C Note that the load for r24 after the possible pointer increment is done
65C before the explicit store to dst[0], in case src==dst.
66
67
68ASM_START()
69
	C L(dat) holds the double constant 2.0.  mpn_com loads it into f1 and
	C subtracts it from its floating point loop count on each iteration.
70FLOAT64(L(dat), 2.0)
71
72	ALIGN(16)		C 16-byte alignment ahead of the function entry
73
74PROLOGUE(mpn_com,gp)
75
	C Store the ones complement of each of the size limbs at src into the
	C corresponding limb at dst.  No result is returned; r0 is never
	C written.
	C
	C src[0] is loaded and dst[0] is stored unconditionally, so size is
	C expected to be at least 1.
	C
	C Inputs:
76	C r16	dst
77	C r17	src
78	C r18	size
	C
	C Values set up by the entry sequence:
	C r5	1 if size is even, else 0
	C r6	(size-3)/2 as an integer, moved to f0 through the stack
	C r7	size-3
	C r8	address of L(dat)
	C r19	dst, advanced by one limb when size is even
	C r20	~src[0]
	C r24	~src[0 or 1], the first limb pending for the loop
	C f0	(size-3)/2 as a float, the loop count
	C f1	2.0
	C
	C Instructions are laid out in groups separated by blank lines.  The
	C unops are no-op fillers, presumably there to control issue slotting
	C and to keep L(top) on a 16-byte boundary.
79
80	lda	r30, -16(r30)		C temporary stack space
81	lda	r7, -3(r18)		C size - 3
82
83	ldq	r20, 0(r17)		C src[0]
	C srl is a logical shift, so for size<3 (negative r7) the value in r6
	C is not a meaningful count.  That case never uses it, since the blt
	C below goes straight to L(done_1).
84	srl	r7, 1, r6		C (size-3)/2
85
	C The integer count goes to the FP register file through memory:
	C stq here, ldt into f0 below, then cvtqt to make it a float.
86	stq	r6, 8(r30)		C (size-3)/2
87	and	r7, 1, r5		C 1 if size even
88
89	LEA(	r8, L(dat))
90	s8addq	r5, r17, r17		C skip src[0] if even
91
92	ornot	r31, r20, r20		C ~src[0]
93	unop
94
95	ldt	f0, 8(r30)		C (size-3)/2
96	ldq	r24, 0(r17)		C src[0 or 1]
97
98	stq	r20, 0(r16)		C dst[0]
99	s8addq	r5, r16, r19		C skip dst[0] if even
100
101	ldt	f1, 0(r8)		C data 2.0
102	lda	r30, 16(r30)		C restore stack
103	unop
104	cvtqt	f0, f0			C (size-3)/2 as float
105
106	ornot	r31, r24, r24		C ~src[0 or 1]
107	blt	r7, L(done_1)		C if size<=2
108	unop
109	unop
110
111
112	C 16-byte alignment here
113L(top):
114	C r17	src, incrementing
115	C r19	dst, incrementing
116	C r24	dst[i] result, ready to store
117	C f0	(size-3)/2, decrementing
118	C f1	2.0
119
120	ldq	r20, 8(r17)		C src[i+1]
121	ldq	r21, 16(r17)		C src[i+2]
122	unop
123	unop
124
	C A count of zero means exactly three limbs remain: the pending r24
	C plus the two just loaded into r20 and r21.
125	fbeq	f0, L(done_2)
126	unop
127	ldq	r22, 24(r17)		C src[i+3]
128	ldq	r23, 32(r17)		C src[i+4]
129
130	stq	r24, 0(r19)		C dst[i]
131	ornot	r31, r20, r20		C ~src[i+1]
132	subt	f0, f1, f0		C count -= 2
133	unop
134
135	stq	r20, 8(r19)		C dst[i+1]
136	ornot	r31, r21, r21		C ~src[i+2]
137	unop
138	unop
139
140	stq	r21, 16(r19)		C dst[i+2]
141	ornot	r31, r22, r22		C ~src[i+3]
142
143	stq	r22, 24(r19)		C dst[i+3]
144	ornot	r31, r23, r24		C ~src[i+4], pending dst[i] for the next pass
145
146	lda	r17, 32(r17)		C src += 4
147	lda	r19, 32(r19)		C dst += 4
148	unop
	C Loop while the count is still >= 0.  Falling through means the count
	C went negative and only the pending r24 limb is left to store.
149	fbge	f0, L(top)
150
151
152L(done_1):
153	C r19	&dst[size-1]
154	C r24	result for dst[size-1]
155
156	stq	r24, 0(r19)		C dst[size-1]
157	ret	r31, (r26), 1		C return through r26
158
159
160L(done_2):
161	C r19	&dst[size-3]
162	C r20	src[size-2]
163	C r21	src[size-1]
164	C r24	result for dst[size-3]
165
166	stq	r24, 0(r19)		C dst[size-3]
167	ornot	r31, r20, r20		C ~src[size-2]
168
169	stq	r20, 8(r19)		C dst[size-2]
170	ornot	r31, r21, r21		C ~src[size-1]
171
172	stq	r21, 16(r19)		C dst[size-1]
173	ret	r31, (r26), 1		C return through r26
174
175EPILOGUE()
176ASM_END()
177