dnl  Intel P6 mpn_copyd -- copy limb vector backwards.

dnl  Copyright 2001, 2002 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or
dnl  modify it under the terms of the GNU Lesser General Public License as
dnl  published by the Free Software Foundation; either version 3 of the
dnl  License, or (at your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful,
dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
dnl  Lesser General Public License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C P6: 1.75 cycles/limb, or 0.75 if no overlap


C void mpn_copyd (mp_ptr dst, mp_srcptr src, mp_size_t size);
C
C An explicit loop is used because a decrementing rep movsl is a bit slow at
C 2.4 c/l.  That rep movsl also has about a 40 cycle startup time, and the
C code here stands a chance of being faster if the branches predict well.
C
C The slightly strange loop form seems necessary for the claimed speed.
C Maybe load/store ordering affects it.
C
C The source and destination are checked to see if they're actually
C overlapping, since it might be possible to use an incrementing rep movsl
C at 0.75 c/l.  (It doesn't suffer the bad startup time of the decrementing
C version.)
C
C Enhancements:
C
C Top speed for an all-integer copy is probably 1.0 c/l, being one load and
C one store each cycle.  Unrolling the loop below would approach 1.0, but
C it'd be good to know why something like store/load/subl + store/load/jnz
C doesn't already run at 1.0 c/l.  It looks like it should decode in 2
C cycles, but doesn't run that way.
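C
C As a rough guide only, here is a minimal C sketch of the strategy used
C below.  This is added commentary for this file, not part of the GMP
C sources; mp_ptr, mp_srcptr and mp_size_t are the usual GMP types, and
C limbs are 4 bytes as on x86.
C
C	void
C	copyd_sketch (mp_ptr dst, mp_srcptr src, mp_size_t size)
C	{
C	  mp_size_t  i;
C	  if (src >= dst || src + size <= dst)
C	    /* disjoint, or dst at or below src: an incrementing copy
C	       is safe, done below with the 0.75 c/l rep movsl */
C	    for (i = 0; i < size; i++)
C	      dst[i] = src[i];
C	  else
C	    /* dst above src and overlapping: must copy high to low,
C	       done below with the explicit 1.75 c/l loop */
C	    for (i = size-1; i >= 0; i--)
C	      dst[i] = src[i];
C	}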

defframe(PARAM_SIZE,12)
defframe(PARAM_SRC, 8)
defframe(PARAM_DST, 4)

dnl  re-using parameter space
define(SAVE_ESI,`PARAM_SIZE')
define(SAVE_EDI,`PARAM_SRC')

	TEXT
	ALIGN(16)

PROLOGUE(mpn_copyd)
deflit(`FRAME',0)

	movl	PARAM_SIZE, %ecx

	movl	%esi, SAVE_ESI
	movl	PARAM_SRC, %esi

	movl	%edi, SAVE_EDI
	movl	PARAM_DST, %edi

	subl	$1, %ecx
	jb	L(zero)

	movl	(%esi,%ecx,4), %eax		C src[size-1]
	jz	L(one)

	movl	-4(%esi,%ecx,4), %edx		C src[size-2]
	subl	$2, %ecx
	jbe	L(done_loop)			C 2 or 3 limbs only


	C The usual overlap is
	C
	C     high                   low
	C     +------------------+
	C     |               dst|
	C     +------------------+
	C           +------------------+
	C           |               src|
	C           +------------------+
	C
	C We can use an incrementing copy in the following circumstances.
	C
	C     src+4*size<=dst, since then the regions are disjoint
	C
	C     src==dst, clearly (though this shouldn't occur normally)
	C
	C     src>dst, since in that case it's a requirement of the
	C              parameters that src>=dst+size*4, and hence the
	C              regions are disjoint
	C
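	C Illustrative restatement (added commentary, not from the GMP
	C sources): the three cases collapse to just two tests, since
	C src>=dst covers both src==dst and src>dst,
	C
	C	if (src >= dst || src + 4*size <= dst)
	C	    goto use_movsl;
	C
	C which is exactly what the two compare/branch pairs below do.
	C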
	leal	12(%esi,%ecx,4), %edx	C src+4*size, since ecx is size-3
	cmpl	%edi, %esi
	jae	L(use_movsl)		C src >= dst

	cmpl	%edi, %edx
	movl	4(%esi,%ecx,4), %edx	C src[size-2] again
	jbe	L(use_movsl)		C src+4*size <= dst


L(top):
	C eax	prev high limb
	C ebx
	C ecx	counter, size-3 down to 0 or -1, inclusive, by 2s
	C edx	prev low limb
	C esi	src
	C edi	dst
	C ebp

	movl	%eax, 8(%edi,%ecx,4)
	movl	(%esi,%ecx,4), %eax

	movl	%edx, 4(%edi,%ecx,4)
	movl	-4(%esi,%ecx,4), %edx

	subl	$2, %ecx
	jnbe	L(top)


L(done_loop):
	movl	%eax, 8(%edi,%ecx,4)
	movl	%edx, 4(%edi,%ecx,4)

	C copy low limb (needed if size was odd, but will already have been
	C done in the loop if size was even)
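	C
	C (Worked example, added for clarity: size==5 reaches L(done_loop)
	C with ecx==0, so the loop plus the two stores above cover
	C dst[4]..dst[1] and the store below supplies dst[0]; size==4
	C reaches it with ecx==-1, dst[0] is already stored, and storing
	C it again here is harmless.)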
	movl	(%esi), %eax
L(one):
	movl	%eax, (%edi)
	movl	SAVE_EDI, %edi
	movl	SAVE_ESI, %esi

	ret


L(use_movsl):
	C eax
	C ebx
	C ecx	size-3
	C edx
	C esi	src
	C edi	dst
	C ebp

	addl	$3, %ecx	C restore ecx to size (it was size-3)

	cld		C better safe than sorry, see mpn/x86/README

	rep
	movsl

L(zero):
	movl	SAVE_ESI, %esi
	movl	SAVE_EDI, %edi

	ret

EPILOGUE()