dnl  xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86/k7/mmx/copyd.asm (revision bdc22b2e01993381dcefeff2bc9b56ca75a4235c)
dnl  AMD K7 mpn_copyd -- copy limb vector, decrementing.

dnl  Copyright 1999, 2000, 2002 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C    alignment dst/src, A=0mod8 N=4mod8
C       A/A   A/N   N/A   N/N
C K7    0.75  1.0   1.0   0.75
C
C NOTE(review): the unit of the figures above is not stated in this file.
C Presumably cycles/limb, the unit quoted for the simple loop; to be confirmed.


C void mpn_copyd (mp_ptr dst, mp_srcptr src, mp_size_t size);
C
C Copy size limbs from src to dst, starting with the highest addressed limb
C and working down to the lowest.
C
C The various comments in mpn/x86/k7/copyi.asm apply here too.

dnl  Stack arguments at offsets 4, 8 and 12, in prototype order.  The code
dnl  uses each name directly as a memory operand.  FRAME is set to 0 here.
dnl  NOTE(review): how defframe combines FRAME with these offsets is decided
dnl  by the macros pulled in through config.m4, which are not visible here.
defframe(PARAM_SIZE,12)
defframe(PARAM_SRC, 8)
defframe(PARAM_DST, 4)
deflit(`FRAME',0)

dnl  parameter space reused: each slot is written only after its argument
dnl  has been loaded into a register (size into ecx, src into eax)
define(SAVE_EBX,`PARAM_SIZE')
define(SAVE_ESI,`PARAM_SRC')

dnl  minimum 5 since the unrolled code can't handle less than 5: it may copy
dnl  one limb for alignment and then always runs the 4-limb loop at least once
deflit(UNROLL_THRESHOLD, 5)

C mpn_copyd -- copy size limbs from src to dst, highest addressed limb first.
C
C Register use once the arguments are loaded:
C   eax       src
C   ecx       size, then a running limb counter
C   edx       dst
C   ebx       scratch, saved in SAVE_EBX on entry and reloaded before each ret
C   esi       scratch for the alignment test only, saved and reloaded around it
C   mm0, mm1  data in the unrolled path, which executes emms before its ret
C
C Sizes below UNROLL_THRESHOLD (including 0) use the one limb per iteration
C loop.  Larger sizes use the MMX loop, which moves four limbs per iteration
C and then finishes the remaining 0 to 3 limbs.

	TEXT
	ALIGN(32)
PROLOGUE(mpn_copyd)

	movl	PARAM_SIZE, %ecx
	movl	%ebx, SAVE_EBX		C size is in ecx, slot now holds ebx

	movl	PARAM_SRC, %eax
	movl	PARAM_DST, %edx

	cmpl	$UNROLL_THRESHOLD, %ecx
	jae	L(unroll)		C unsigned compare, taken for size >= 5

	orl	%ecx, %ecx
	jz	L(simple_done)		C size 0, nothing to copy

L(simple):
	C eax	src
	C ebx	scratch
	C ecx	counter
	C edx	dst
	C
	C this loop is 2 cycles/limb
	C
	C Each pass copies limb number ecx-1, so ecx counts down from size
	C to 1 and the loop exits when it reaches 0.

	movl	-4(%eax,%ecx,4), %ebx
	movl	%ebx, -4(%edx,%ecx,4)
	decl	%ecx
	jnz	L(simple)

L(simple_done):
	C no MMX register was touched on this path, so no emms is needed
	movl	SAVE_EBX, %ebx
	ret


L(unroll):
	C Form the end addresses src+4*size and dst+4*size and AND them
	C together.  Bit 2 of the result is set only when both end addresses
	C are 4 mod 8.  esi is borrowed for this and put back straight away.
	movl	%esi, SAVE_ESI
	leal	(%eax,%ecx,4), %ebx
	leal	(%edx,%ecx,4), %esi

	andl	%esi, %ebx
	movl	SAVE_ESI, %esi
	subl	$4, %ecx		C size-4

	testl	$4, %ebx   C testl to pad code closer to 16 bytes for L(top)
	jz	L(aligned)

	C both src and dst unaligned, process one limb to align them
	C (ecx is size-4 here, so displacement 12 addresses limb size-1)
	movl	12(%eax,%ecx,4), %ebx
	movl	%ebx, 12(%edx,%ecx,4)
	decl	%ecx
L(aligned):


	ALIGN(16)
L(top):
	C eax	src
	C ebx
	C ecx	counter, limbs
	C edx	dst
	C
	C Each pass copies limbs ecx to ecx+3: mm0 takes the upper pair and
	C mm1 the lower pair.  Both loads are done before either store.  The
	C stores carry an extra displacement of 16 because ecx has already
	C been stepped down by 4.  jns tests the sign left by subl and loops
	C while the new ecx is not negative.

	movq	8(%eax,%ecx,4), %mm0
	movq	(%eax,%ecx,4), %mm1
	subl	$4, %ecx
	movq	%mm0, 16+8(%edx,%ecx,4)
	movq	%mm1, 16(%edx,%ecx,4)
	jns	L(top)


	C now %ecx is -4 to -1 representing respectively 0 to 3 limbs remaining
	C
	C The low two bits of cl equal that remaining count: bit 1 set means a
	C pair is still to be copied, bit 0 set means a single limb is.

	testb	$2, %cl
	jz	L(finish_not_two)

	C pair at limb ecx+2, that is limbs 0,1 when two remain and limbs 1,2
	C when three remain
	movq	8(%eax,%ecx,4), %mm0
	movq	%mm0, 8(%edx,%ecx,4)
L(finish_not_two):

	testb	$1, %cl
	jz	L(done)

	C an odd count leaves limb 0 as the last one to copy
	movl	(%eax), %ebx
	movl	%ebx, (%edx)

L(done):
	movl	SAVE_EBX, %ebx
	emms				C leave MMX state before returning
	ret


EPILOGUE()
