xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86/k7/mmx/copyi.asm (revision c38e7cc395b1472a774ff828e46123de44c628e9)
dnl  AMD K7 mpn_copyi -- copy limb vector, incrementing.

dnl  Copyright 1999, 2000, 2002, 2003 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.
30
31include(`../config.m4')
32
33
34C    alignment dst/src, A=0mod8 N=4mod8
35C       A/A   A/N   N/A   N/N
36C K7    0.75  1.0   1.0   0.75
37
38
39C void mpn_copyi (mp_ptr dst, mp_srcptr src, mp_size_t size);
40C
41C Copy src,size to dst,size.
42C
43C This code at 0.75 or 1.0 c/l is always faster than a plain rep movsl at
44C 1.33 c/l.
45C
46C The K7 can do a 64-bit load and 64-bit store in one cycle (optimization
47C guile 22007 appendix B), so 0.5 c/l should be possible, however nothing
48C under 0.7 c/l is known.  Apparently only two 32-bit stores can be done in
49C one cycle, so perhaps some scheduling is needed to ensure it's a
50C load+store in each cycle, not store+store.
51C
52C If both source and destination are unaligned then one limb is processed at
53C the start to make them aligned and so get 0.75 c/l, whereas if they'd been
54C used unaligned it would be 1.5 c/l.
55
56defframe(PARAM_SIZE,12)
57defframe(PARAM_SRC, 8)
58defframe(PARAM_DST, 4)
59
60dnl  parameter space reused
61define(SAVE_EBX,`PARAM_SIZE')
62
63dnl  minimum 5 since the unrolled code can't handle less than 5
64deflit(UNROLL_THRESHOLD, 5)
65
66	TEXT
67	ALIGN(32)
68PROLOGUE(mpn_copyi)
69deflit(`FRAME',0)
70
71	movl	PARAM_SIZE, %ecx
72	movl	%ebx, SAVE_EBX
73
74	movl	PARAM_SRC, %eax
75	movl	PARAM_DST, %edx
76
77	cmpl	$UNROLL_THRESHOLD, %ecx
78	jae	L(unroll)
79
80	orl	%ecx, %ecx
81	jz	L(simple_done)
82
83L(simple):
84	C eax	src, incrementing
85	C ebx	scratch
86	C ecx	counter
87	C edx	dst, incrementing
88	C
89	C this loop is 2 cycles/limb
90
91	movl	(%eax), %ebx
92	movl	%ebx, (%edx)
93	decl	%ecx
94	leal	4(%eax), %eax
95	leal	4(%edx), %edx
96	jnz	L(simple)
97
98L(simple_done):
99	movl	SAVE_EBX, %ebx
100	ret
101
102
103L(unroll):
104	movl	%eax, %ebx
105	leal	-12(%eax,%ecx,4), %eax	C src end - 12
106	subl	$3, %ecx		C size-3
107
108	andl	%edx, %ebx
109	leal	(%edx,%ecx,4), %edx	C dst end - 12
110	negl	%ecx
111
112	testl	$4, %ebx   C testl to pad code closer to 16 bytes for L(top)
113	jz	L(aligned)
114
115	C both src and dst unaligned, process one limb to align them
116	movl	(%eax,%ecx,4), %ebx
117	movl	%ebx, (%edx,%ecx,4)
118	incl	%ecx
119L(aligned):
120
121
122	ALIGN(16)
123L(top):
124	C eax	src end - 12
125	C ebx
126	C ecx	counter, negative, limbs
127	C edx	dst end - 12
128
129	movq	(%eax,%ecx,4), %mm0
130	movq	8(%eax,%ecx,4), %mm1
131	addl	$4, %ecx
132	movq	%mm0, -16(%edx,%ecx,4)
133	movq	%mm1, -16+8(%edx,%ecx,4)
134	ja	L(top)		C jump no carry and not zero
135
136
137	C now %ecx is 0 to 3 representing respectively 3 to 0 limbs remaining
138
139	testb	$2, %cl
140	jnz	L(finish_not_two)
141
142	movq	(%eax,%ecx,4), %mm0
143	movq	%mm0, (%edx,%ecx,4)
144L(finish_not_two):
145
146	testb	$1, %cl
147	jnz	L(done)
148
149	movl	8(%eax), %ebx
150	movl	%ebx, 8(%edx)
151
152L(done):
153	movl	SAVE_EBX, %ebx
154	emms
155	ret
156
157EPILOGUE()
158