xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86/k7/mmx/com.asm (revision d16b7486a53dcb8072b60ec6fcb4373a2d0c27b7)
1dnl  AMD Athlon mpn_com -- mpn bitwise one's complement.
2
3dnl  Copyright 2002 Free Software Foundation, Inc.
4
5dnl  This file is part of the GNU MP Library.
6dnl
7dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8dnl  it under the terms of either:
9dnl
10dnl    * the GNU Lesser General Public License as published by the Free
11dnl      Software Foundation; either version 3 of the License, or (at your
12dnl      option) any later version.
13dnl
14dnl  or
15dnl
16dnl    * the GNU General Public License as published by the Free Software
17dnl      Foundation; either version 2 of the License, or (at your option) any
18dnl      later version.
19dnl
20dnl  or both in parallel, as here.
21dnl
22dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25dnl  for more details.
26dnl
27dnl  You should have received copies of the GNU General Public License and the
28dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29dnl  see https://www.gnu.org/licenses/.
30
31include(`../config.m4')
32
33
34C K7: 1.0 cycles/limb
35
36
37C void mpn_com (mp_ptr dst, mp_srcptr src, mp_size_t size);
38C
39C The loop form below is necessary for the claimed speed.  It needs to be
40C aligned to a 16 byte boundary and only 16 bytes long.  Maybe that's so it
41C fits in a BTB entry.  The adjustments to %eax and %edx avoid offsets on
42C the movq's and achieve the necessary size.
43C
44C If both src and dst are 4mod8, the loop runs at 1.5 c/l.  So long as one
45C of the two is 0mod8, it runs at 1.0 c/l.  On that basis dst is checked
46C (offset by the size, as per the loop addressing) and one high limb
47C processed separately to get alignment.
48C
49C The padding for the nails case is unattractive, but shouldn't cost any
50C cycles.  Explicit .byte's guarantee the desired instructions, at a point
51C where we're probably stalled waiting for loads anyway.
52C
53C Enhancements:
54C
55C The combination load/pxor/store might be able to be unrolled to approach
56C 0.5 c/l if desired.
57
C Stack operands for the three C arguments.  The offsets 4, 8 and 12 follow
C the argument order of the prototype (dst, src, size); presumably they are
C relative to %esp on entry with the return address at offset 0 -- confirm
C against the defframe definition.
58defframe(PARAM_SIZE,12)
59defframe(PARAM_SRC, 8)
60defframe(PARAM_DST, 4)
61
C Function entry is placed on a 16-byte boundary.  TEXT presumably selects
C the code section -- confirm against its definition.  The loop offsets
C quoted further down (0x30 or 0x40) are relative to this aligned entry.
62	TEXT
63	ALIGN(16)
64
C mpn_com: write the bitwise complement of the size limbs at src to dst.
C Each limb is XORed with %mm7, which is all ones, or all ones shifted right
C by GMP_NAIL_BITS in each 32-bit half when GMP_NAIL_BITS is nonzero.
C
C Registers written: %eax, %ecx, %edx, %mm0, %mm7.  FRAME is set to 0 and
C %esp is never changed in this routine.
C
C There is no size==0 check: the high limb load at -4(src,size,4) is done
C unconditionally, so callers must pass size >= 1.
C
C Flag usage: the result of the andl is not tested until jz L(aligned); the
C intervening movl, movd, psrld and padding lea instructions do not write
C EFLAGS.  Likewise jg and jnz after the loop body test the flags of the
C subl, since movq and pxor do not write EFLAGS.
65PROLOGUE(mpn_com)
66deflit(`FRAME',0)
67
68	movl	PARAM_DST, %edx			C edx = dst
69	movl	PARAM_SIZE, %ecx		C ecx = size in limbs
70	pcmpeqd	%mm7, %mm7			C mm7 = all ones, the xor mask
71
72	leal	(%edx,%ecx,4), %eax		C eax = dst + 4*size, end of dst
73	andl	$4, %eax			C ZF=1 if bit 2 of dst end is clear
74ifelse(GMP_NAIL_BITS,0,,
75`	psrld	$GMP_NAIL_BITS, %mm7')		C GMP_NUMB_MASK
76
77	movl	PARAM_SRC, %eax			C eax = src, flags from andl kept
78	movd	-4(%eax,%ecx,4), %mm0		C src high limb
79
80ifelse(GMP_NAIL_BITS,0,,
81`	C padding for alignment below
82	.byte	0x8d, 0xb6, 0x00, 0x00, 0x00, 0x00	C lea 0(%esi),%esi
83	.byte	0x8d, 0xbf, 0x00, 0x00, 0x00, 0x00	C lea 0(%edi),%edi
84')
85
86	jz	L(aligned)			C dst end already 0mod8
87
	C dst end is 4mod8: complement the high limb by itself so that the
	C end of the remaining region is 0mod8.
88	pxor	%mm7, %mm0
89	movd	%mm0, -4(%edx,%ecx,4)		C dst high limb
90	decl	%ecx				C one limb fewer to do
91	jz	L(done)				C size was 1
92L(aligned):
93
	C Bias both pointers by one limb and the counter by -1, so that after
	C each subl $2 the limb pair to process is at (%eax,%ecx,4) and
	C (%edx,%ecx,4) with no displacement.
94	addl	$4, %eax
95	addl	$4, %edx
96	decl	%ecx
97	jz	L(one)				C exactly one limb remains
98
99	C offset 0x30 for no nails, or 0x40 for nails
100	ALIGN(16)
101L(top):
102	C eax	src + 4
103	C ebx
104	C ecx	counter, limbs remaining minus 1
105	C edx	dst + 4
	C mm0	limb pair being processed
	C mm7	xor mask
106
107	subl	$2, %ecx			C two limbs per pass, sets flags
108	movq	(%eax,%ecx,4), %mm0		C load src limb pair
109	pxor	%mm7, %mm0			C complement both limbs
110	movq	%mm0, (%edx,%ecx,4)		C store dst limb pair
111	jg	L(top)				C loop while counter > 0
112
	C Here the counter is 0 if one low limb is left, or -1 if none.
113	jnz	L(done)				C count at L(aligned) was even
114
115L(one):
116	movd	-4(%eax), %mm0			C src low limb
117	pxor	%mm7, %mm0
118	movd	%mm0, -4(%edx)			C dst low limb
119
120L(done):
121	emms					C clear MMX state so x87 code can follow
122
123	ret
124
125EPILOGUE()
126