xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86/k7/mmx/copyi.asm (revision ca453df649ce9db45b64d73678ba06cbccf9aa11)
1dnl  AMD K7 mpn_copyi -- copy limb vector, incrementing.
2
3dnl  Copyright 1999, 2000, 2002, 2003 Free Software Foundation, Inc.
4dnl
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or
8dnl  modify it under the terms of the GNU Lesser General Public License as
9dnl  published by the Free Software Foundation; either version 3 of the
10dnl  License, or (at your option) any later version.
11dnl
12dnl  The GNU MP Library is distributed in the hope that it will be useful,
13dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
14dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15dnl  Lesser General Public License for more details.
16dnl
17dnl  You should have received a copy of the GNU Lesser General Public License
18dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
19
20include(`../config.m4')
21
22
23C    alignment dst/src, A=0mod8 N=4mod8
24C       A/A   A/N   N/A   N/N
25C K7    0.75  1.0   1.0   0.75
26
27
28C void mpn_copyi (mp_ptr dst, mp_srcptr src, mp_size_t size);
29C
30C Copy src,size to dst,size.
31C
32C This code at 0.75 or 1.0 c/l is always faster than a plain rep movsl at
33C 1.33 c/l.
34C
35C The K7 can do a 64-bit load and 64-bit store in one cycle (optimization
36C guide 22007 appendix B), so 0.5 c/l should be possible, however nothing
37C under 0.7 c/l is known.  Apparently only two 32-bit stores can be done in
38C one cycle, so perhaps some scheduling is needed to ensure it's a
39C load+store in each cycle, not store+store.
40C
41C If both source and destination are unaligned then one limb is processed at
42C the start to make them aligned and so get 0.75 c/l, whereas if they'd been
43C used unaligned it would be 1.5 c/l.
44
45defframe(PARAM_SIZE,12)
46defframe(PARAM_SRC, 8)
47defframe(PARAM_DST, 4)
48
49dnl  parameter space reused
dnl  The size parameter's stack slot doubles as the save area for ebx: size
dnl  is loaded into ecx before ebx is stored there, and the parameter is
dnl  never read again afterwards.
50define(SAVE_EBX,`PARAM_SIZE')
51
52dnl  minimum 5 since the unrolled code can't handle less than 5
53deflit(UNROLL_THRESHOLD, 5)
54
55	TEXT
56	ALIGN(32)
57PROLOGUE(mpn_copyi)
58deflit(`FRAME',0)

	C void mpn_copyi (mp_ptr dst, mp_srcptr src, mp_size_t size)
	C
	C Entry register usage:
	C eax	src
	C ecx	size, in limbs
	C edx	dst

59
60	movl	PARAM_SIZE, %ecx
61	movl	%ebx, SAVE_EBX
62
63	movl	PARAM_SRC, %eax
64	movl	PARAM_DST, %edx
65
	C sizes below UNROLL_THRESHOLD take the simple 2 c/l loop below
66	cmpl	$UNROLL_THRESHOLD, %ecx
67	jae	L(unroll)
68
	C size==0: nothing to copy
69	orl	%ecx, %ecx
70	jz	L(simple_done)
71
72L(simple):
	C eax	src, incrementing
	C ebx	scratch
	C ecx	counter
	C edx	dst, incrementing
	C
	C this loop is 2 cycles/limb

	C leal is used for the pointer increments to preserve the flags from
	C decl for the jnz below
80	movl	(%eax), %ebx
81	movl	%ebx, (%edx)
82	decl	%ecx
83	leal	4(%eax), %eax
84	leal	4(%edx), %edx
85	jnz	L(simple)
86
87L(simple_done):
88	movl	SAVE_EBX, %ebx
89	ret
90
91
92L(unroll):
	C Point src and dst 12 bytes below their block ends and negate the
	C counter (to -(size-3)), so the main loop needs only the single
	C induction variable ecx and can terminate on the flags left by
	C "addl $4".  ebx gets src&dst for the alignment test below.
93	movl	%eax, %ebx
94	leal	-12(%eax,%ecx,4), %eax	C src end - 12
95	subl	$3, %ecx		C size-3
96
97	andl	%edx, %ebx
98	leal	(%edx,%ecx,4), %edx	C dst end - 12
99	negl	%ecx
100
	C bit 2 set in src&dst means BOTH pointers are 4mod8; copying one
	C limb first then makes both 8-byte aligned (0.75 c/l).  If only one
	C is 4mod8 nothing can fix it and the loop runs at 1.0 c/l (see the
	C table at the top of the file).
101	testl	$4, %ebx   C testl to pad code closer to 16 bytes for L(top)
102	jz	L(aligned)
103
	C both src and dst unaligned, process one limb to align them
105	movl	(%eax,%ecx,4), %ebx
106	movl	%ebx, (%edx,%ecx,4)
107	incl	%ecx
108L(aligned):
109
110
111	ALIGN(16)
112L(top):
	C eax	src end - 12
	C ebx
	C ecx	counter, negative, limbs
	C edx	dst end - 12
	C
	C 4 limbs per iteration via two 8-byte MMX moves.  The stores use
	C the post-increment ecx, hence the -16 displacement compensation.
	C ja tests the flags from "addl $4": it loops while ecx is still
	C negative after the add (the add that crosses zero sets CF and
	C leaves ecx in 0..3, ending the loop).

118	movq	(%eax,%ecx,4), %mm0
119	movq	8(%eax,%ecx,4), %mm1
120	addl	$4, %ecx
121	movq	%mm0, -16(%edx,%ecx,4)
122	movq	%mm1, -16+8(%edx,%ecx,4)
123	ja	L(top)		C jump no carry and not zero
124
125
	C now %ecx is 0 to 3 representing respectively 3 to 0 limbs remaining
127
	C bit 1 of ecx set (ecx=2 or 3) means fewer than 2 limbs remain, so
	C skip the 2-limb copy
128	testb	$2, %cl
129	jnz	L(finish_not_two)
130
	C copy 2 limbs: with eax/edx = end-12, ecx=0 covers bytes end-12 to
	C end-5, ecx=1 covers end-8 to end-1
131	movq	(%eax,%ecx,4), %mm0
132	movq	%mm0, (%edx,%ecx,4)
133L(finish_not_two):
134
	C bit 0 of ecx set (ecx=1 or 3) means no odd final limb remains
135	testb	$1, %cl
136	jnz	L(done)
137
	C copy the last limb, at end-4 = 8 bytes above "end - 12"
138	movl	8(%eax), %ebx
139	movl	%ebx, 8(%edx)
140
141L(done):
142	movl	SAVE_EBX, %ebx
	C emms clears the MMX state so subsequent x87 FPU code works
143	emms
144	ret
145
146EPILOGUE()
147