xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86/pentium4/copyi.asm (revision f14316bcbc544b96a93e884bc5c2b15fd60e22ae)
1dnl  Pentium-4 mpn_copyi -- copy limb vector, incrementing.
2dnl
3
4dnl  Copyright 1999, 2000, 2001 Free Software Foundation, Inc.
5dnl
6dnl  This file is part of the GNU MP Library.
7dnl
8dnl  The GNU MP Library is free software; you can redistribute it and/or
9dnl  modify it under the terms of the GNU Lesser General Public License as
10dnl  published by the Free Software Foundation; either version 3 of the
11dnl  License, or (at your option) any later version.
12dnl
13dnl  The GNU MP Library is distributed in the hope that it will be useful,
14dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
15dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16dnl  Lesser General Public License for more details.
17dnl
18dnl  You should have received a copy of the GNU Lesser General Public License
19dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
20
21
22dnl  The rep/movsl is very slow for small blocks on pentium4.  Its startup
23dnl  time seems to be about 110 cycles.  It then copies at a rate of one
24dnl  limb per cycle.  We therefore fall back to an open-coded 2 c/l copying
25dnl  loop for smaller sizes.
26
27dnl  Ultimately, we may want to use 64-bit movd or 128-bit movdqu in some
28dnl  nifty unrolled arrangement.  Clearly, that could reach much higher
29dnl  speeds, at least for large blocks.
30
31include(`../config.m4')
32
33
C Stack slots for the three arguments of mpn_copyi: dst, src and size.
C The defframe macro comes from the m4 support files and is not visible
C here.  The numbers are presumably byte offsets from the stack pointer
C at entry -- TODO confirm against the x86 m4 definitions.
34defframe(PARAM_SIZE, 12)
35defframe(PARAM_SRC, 8)
36defframe(PARAM_DST,  4)
37
38	TEXT
39	ALIGN(8)
40
C mpn_copyi -- copy size limbs of 4 bytes each from src to dst, walking
C both pointers upwards.
C
C Two strategies, selected by a signed compare of size against 150:
C   size <= 150 : open-coded loop moving one limb per iteration via ebx
C   size >  150 : rep movsl, whose long startup only pays off on big blocks
C
C Register use:
C   short path : eax = src, edx = dst, ecx = limbs left, ebx = limb in
C                flight.  ebx is saved in the size slot and restored.
C   long path  : ecx = limb count, esi = src, edi = dst.  The incoming esi
C                and edi are parked in eax and edx and restored.
C No return value is set up by this routine.
41PROLOGUE(mpn_copyi)
42deflit(`FRAME',0)
43
44	movl	PARAM_SIZE, %ecx	C ecx = limb count
45	cmpl	$150, %ecx
46	jg	L(replmovs)	C big blocks go to the string instruction
47
48	movl	PARAM_SRC, %eax	C eax = src pointer
49	movl	PARAM_DST, %edx	C edx = dst pointer
50	movl	%ebx, PARAM_SIZE	C save ebx in the size slot, whose value is already in ecx
51	testl	%ecx, %ecx
52	jz	L(end)	C nothing to copy when size is zero
53
C One limb per iteration.  The addl sets the flags tested by the jnz, and
C the movl and leal between them leave the flags unchanged.
C NOTE(review): addl of -1 is presumably preferred over decl for Pentium 4
C flag handling -- confirm before changing.
53L(loop):
54	movl	(%eax), %ebx	C fetch one limb from src
55	leal	4(%eax), %eax	C advance src by one limb
56	addl	$-1, %ecx	C one limb fewer to go
57	movl	%ebx, (%edx)	C store the limb to dst
58	leal	4(%edx), %edx	C advance dst by one limb
59
60	jnz	L(loop)	C repeat until the count reaches zero
61
62L(end):
63	movl	PARAM_SIZE, %ebx	C restore the ebx saved on entry to the short path
64	ret
65
C Large block path.  ecx still holds the limb count, which is what the
C rep prefix consumes.
66L(replmovs):
67	cld	C better safe than sorry, see mpn/x86/README
68
69	movl	%esi, %eax	C park esi in eax, otherwise unused on this path
70	movl	PARAM_SRC, %esi	C esi = src pointer
71	movl	%edi, %edx	C park edi in edx, otherwise unused on this path
72	movl	PARAM_DST, %edi	C edi = dst pointer
73
74	rep
75	movsl	C copy ecx limbs from esi to edi, ascending since cld cleared DF
76
77	movl	%eax, %esi	C restore the incoming esi
78	movl	%edx, %edi	C restore the incoming edi
79
80	ret
81
82EPILOGUE()
84