xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/alpha/com.asm (revision af56d1fe9956bd7c616e18c1b7f025f464618471)
1dnl  Alpha mpn_com -- mpn one's complement.
2
3dnl  Copyright 2003 Free Software Foundation, Inc.
4dnl
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or
8dnl  modify it under the terms of the GNU Lesser General Public License as
9dnl  published by the Free Software Foundation; either version 3 of the
10dnl  License, or (at your option) any later version.
11dnl
12dnl  The GNU MP Library is distributed in the hope that it will be useful,
13dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
14dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15dnl  Lesser General Public License for more details.
16dnl
17dnl  You should have received a copy of the GNU Lesser General Public License
18dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
19
20include(`../config.m4')
21
22
23C      cycles/limb
24C EV4:    4.75
25C EV5:    2.0
26C EV6:    1.5
27
28
29C void mpn_com (mp_ptr dst, mp_srcptr src, mp_size_t size);
30C
31C For ev5 the main loop is 7 cycles plus 1 taken branch bubble, for a total
32C 2.0 c/l.  In general, a pattern like this unrolled to N limbs per loop
33C will be 1.5+2/N c/l.
34C
35C 2 cycles of loop control are unavoidable, for pointer updates and the
36C taken branch bubble, but also since ldq cannot issue two cycles after stq
37C (and with a run of stqs that means neither of two cycles at the end of the
38C loop).
39C
40C The fbeq is forced into the second cycle of the loop using unops, since
41C the first time through it must wait for the cvtqt result.  Once that
42C result is ready (a 1 cycle stall) then both the branch and following loads
43C can issue together.
44C
45C The main loop handles an odd count of limbs, being two limbs loaded before
46C each size test, plus one pipelined around from the previous iteration (or
47C set up in the entry sequence).
48C
49C An even number of limbs is handled by an explicit dst[0]=~src[0] in the
50C entry sequence, and an increment of the pointers.  For an odd size there's
51C no increment and the first store in the loop (r24) is a repeat of dst[0].
52C
53C Note that the load for r24 after the possible pointer increment is done
54C before the explicit store to dst[0], in case src==dst.
55
56
57ASM_START()
58
C Constant 2.0: loaded into f1 below and subtracted from the f0 loop
C counter once per pass of the main loop.
59FLOAT64(L(dat), 2.0)
60
61	ALIGN(16)
62
63PROLOGUE(mpn_com,gp)
64
	C Writes dst[i] = ~src[i] for each of the size limbs.
	C r0 is never written in this routine.
	C
	C src[0] is loaded and dst[0] is stored unconditionally, so this code
	C presumes size >= 1.
	C
	C The loop count (size-3)/2 is passed to f0 through a temporary stack
	C slot (stq then ldt) and converted with cvtqt, so that loop control
	C is done with floating point branches (fbeq/fbge).
	C
65	C r16	dst
66	C r17	src
67	C r18	size
68
69	lda	r30, -16(r30)		C temporary stack space
70	lda	r7, -3(r18)		C size - 3
71
72	ldq	r20, 0(r17)		C src[0]
73	srl	r7, 1, r6		C (size-3)/2
74
75	stq	r6, 8(r30)		C (size-3)/2
76	and	r7, 1, r5		C 1 if size even
77
78	LEA(	r8, L(dat))
79	s8addq	r5, r17, r17		C skip src[0] if even
80
81	ornot	r31, r20, r20		C ~src[0]
82	unop
83
84	ldt	f0, 8(r30)		C (size-3)/2
85	ldq	r24, 0(r17)		C src[0 or 1]
86
87	stq	r20, 0(r16)		C dst[0]
88	s8addq	r5, r16, r19		C skip dst[0] if even
89
90	ldt	f1, 0(r8)		C data 2.0
91	lda	r30, 16(r30)		C restore stack
92	unop
93	cvtqt	f0, f0			C (size-3)/2 as float
94
95	ornot	r31, r24, r24		C ~src[0 or 1], first pipelined result
96	blt	r7, L(done_1)		C if size<=2
97	unop
98	unop
99
100
101	C 16-byte alignment here
102L(top):
103	C r17	src, incrementing
104	C r19	dst, incrementing
105	C r24	dst[i] result, ready to store
106	C f0	(size-3)/2, decrementing
107	C f1	2.0
108
109	ldq	r20, 8(r17)		C src[i+1]
110	ldq	r21, 16(r17)		C src[i+2]
111	unop
112	unop
113
114	fbeq	f0, L(done_2)		C count == 0: finish with r24, r20, r21
115	unop
116	ldq	r22, 24(r17)		C src[i+3]
117	ldq	r23, 32(r17)		C src[i+4]
118
119	stq	r24, 0(r19)		C dst[i]
120	ornot	r31, r20, r20		C ~src[i+1]
121	subt	f0, f1, f0		C count -= 2
122	unop
123
124	stq	r20, 8(r19)		C dst[i+1]
125	ornot	r31, r21, r21		C ~src[i+2]
126	unop
127	unop
128
129	stq	r21, 16(r19)		C dst[i+2]
130	ornot	r31, r22, r22		C ~src[i+3]
131
132	stq	r22, 24(r19)		C dst[i+3]
133	ornot	r31, r23, r24		C ~src[i+4], carried around in r24
134
135	lda	r17, 32(r17)		C src += 4
136	lda	r19, 32(r19)		C dst += 4
137	unop
138	fbge	f0, L(top)		C loop while count >= 0, else fall into done_1
139
140
141L(done_1):
142	C r19	&dst[size-1]
143	C r24	result for dst[size-1]
144
145	stq	r24, 0(r19)		C dst[size-1]
146	ret	r31, (r26), 1
147
148
149L(done_2):
150	C r19	&dst[size-3]
151	C r20	src[size-2]
152	C r21	src[size-1]
153	C r24	result for dst[size-3]
154
155	stq	r24, 0(r19)		C dst[size-3]
156	ornot	r31, r20, r20		C ~src[size-2]
157
158	stq	r20, 8(r19)		C dst[size-2]
159	ornot	r31, r21, r21		C ~src[size-1]
160
161	stq	r21, 16(r19)		C dst[size-1]
162	ret	r31, (r26), 1
163
164EPILOGUE()
165ASM_END()
166