xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86/k6/mmx/logops_n.asm (revision ca453df649ce9db45b64d73678ba06cbccf9aa11)
1dnl  AMD K6-2 mpn_and_n, mpn_andn_n, mpn_nand_n, mpn_ior_n, mpn_iorn_n,
2dnl  mpn_nior_n, mpn_xor_n, mpn_xnor_n -- mpn bitwise logical operations.
3
4dnl  Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
5dnl
6dnl  This file is part of the GNU MP Library.
7dnl
8dnl  The GNU MP Library is free software; you can redistribute it and/or
9dnl  modify it under the terms of the GNU Lesser General Public License as
10dnl  published by the Free Software Foundation; either version 3 of the
11dnl  License, or (at your option) any later version.
12dnl
13dnl  The GNU MP Library is distributed in the hope that it will be useful,
14dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
15dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16dnl  Lesser General Public License for more details.
17dnl
18dnl  You should have received a copy of the GNU Lesser General Public License
19dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
20
21include(`../config.m4')
22
23NAILS_SUPPORT(0-31)
24
25
26C         alignment dst/src1/src2, A=0mod8, N=4mod8
27C      A/A/A A/A/N A/N/A A/N/N N/A/A N/A/N N/N/A N/N/N
28C
29C K6-2  1.2   1.5   1.5   1.2   1.2   1.5   1.5   1.2   and,andn,ior,xor
30C K6-2  1.5   1.75  2.0   1.75  1.75  2.0   1.75  1.5   iorn,xnor
31C K6-2  1.75  2.0   2.0   2.0   2.0   2.0   2.0   1.75  nand,nior
32C
33C K6    1.5   1.68  1.75  1.2   1.75  1.75  1.68  1.5   and,andn,ior,xor
34C K6    2.0   2.0   2.25  2.25  2.25  2.25  2.0   2.0   iorn,xnor
35C K6    2.0   2.25  2.25  2.25  2.25  2.25  2.25  2.0   nand,nior
36
37
38dnl  M4_p and M4_i are the MMX and integer instructions
39dnl  M4_*_neg_dst is 1 to complement (bitwise NOT within the numb bits) the final result before writing
40dnl  M4_*_neg_src2 is 1 to complement (bitwise NOT within the numb bits) the src2 values before using them
41
dnl  Usage: M4_choose_op(name, mmx_insn, mmx_neg_dst, mmx_neg_src2,
dnl                      int_insn, int_neg_dst, int_neg_src2)
dnl
dnl  Expands to nothing unless the symbol OPERATION_name is defined.  When it
dnl  is defined, M4_function is set to mpn_name, M4_operation is set to name,
dnl  and the remaining six arguments are stored, in order, in M4_p,
dnl  M4_p_neg_dst, M4_p_neg_src2, M4_i, M4_i_neg_dst and M4_i_neg_src2.
dnl  The m4_assert_numargs(7) call is expected to reject any other argument
dnl  count.
42define(M4_choose_op,
43m4_assert_numargs(7)
44`ifdef(`OPERATION_$1',`
45define(`M4_function',  `mpn_$1')
46define(`M4_operation', `$1')
47define(`M4_p',         `$2')
48define(`M4_p_neg_dst', `$3')
49define(`M4_p_neg_src2',`$4')
50define(`M4_i',         `$5')
51define(`M4_i_neg_dst', `$6')
52define(`M4_i_neg_src2',`$7')
53')')
54
55dnl  xnor is done in "iorn" style because it's a touch faster than "nior"
56dnl  style (the two are equivalent for xor).
57dnl
58dnl  pandn can't be used with nails.
59
dnl  One entry per supported operation.  Columns are: operation name, MMX
dnl  instruction, MMX neg_dst, MMX neg_src2, integer instruction, integer
dnl  neg_dst, integer neg_src2.  Only the entry whose OPERATION_ symbol is
dnl  defined has any effect.
60M4_choose_op( and_n,  pand,0,0,  andl,0,0)
dnl  andn_n has two MMX forms: pandn when GMP_NAIL_BITS is 0, otherwise pand
dnl  applied to a complemented src2.  The integer form is the same in both.
61ifelse(GMP_NAIL_BITS,0,
62`M4_choose_op(andn_n, pandn,0,0, andl,0,1)',
63`M4_choose_op(andn_n, pand,0,1,  andl,0,1)')
64M4_choose_op( nand_n, pand,1,0,  andl,1,0)
65M4_choose_op( ior_n,  por,0,0,   orl,0,0)
66M4_choose_op( iorn_n, por,0,1,   orl,0,1)
67M4_choose_op( nior_n, por,1,0,   orl,1,0)
68M4_choose_op( xor_n,  pxor,0,0,  xorl,0,0)
69M4_choose_op( xnor_n, pxor,0,1,  xorl,0,1)
70
dnl  If M4_function is still undefined then no OPERATION_ symbol matched an
dnl  entry above, so report that through m4_error.
71ifdef(`M4_function',,
72`m4_error(`Unrecognised or undefined OPERATION symbol
73')')
74
dnl  Names of all the functions this one source file can provide.
dnl  NOTE(review): presumably scanned by the build system to find the entry
dnl  points of a multi-function file -- confirm against the configure logic.
75MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n)
76
77
78C void M4_function (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
79C                   mp_size_t size);
80C
81C Do src1,size M4_operation src2,size, storing the result in dst,size.
82C
83C Unaligned movq loads and stores are a bit slower than aligned ones.  The
84C test at the start of the routine checks the alignment of src1 and if
85C necessary processes one limb separately at the low end to make it aligned.
86C
87C The raw speeds without this alignment switch are as follows.
88C
89C           alignment dst/src1/src2, A=0mod8, N=4mod8
90C     A/A/A  A/A/N  A/N/A  A/N/N  N/A/A  N/A/N  N/N/A  N/N/N
91C
92C K6                 1.5    2.0                 1.5    2.0    and,andn,ior,xor
93C K6                 1.75   2.2                 2.0    2.28   iorn,xnor
94C K6                 2.0    2.25                2.35   2.28   nand,nior
95C
96C
97C Future:
98C
99C K6 can do one 64-bit load per cycle so each of these routines should be
100C able to approach 1.0 c/l, if aligned.  The basic and/andn/ior/xor might be
101C able to get 1.0 with just a 4 limb loop, being 3 instructions per 2 limbs.
102C The others are 4 instructions per 2 limbs, and so can only approach 1.0
103C because there's nowhere to hide some loop control.
104
C Argument slots, numbered 4 to 16 in the same order as the prototype:
C dst, src1, src2, size.
C NOTE(review): presumably these are byte offsets from the stack pointer at
C function entry, with FRAME starting at 0 and being adjusted by each
C FRAME_pushl so the PARAM_ names stay valid after a push -- confirm against
C the defframe and FRAME_pushl definitions.
105defframe(PARAM_SIZE,16)
106defframe(PARAM_SRC2,12)
107defframe(PARAM_SRC1,8)
108defframe(PARAM_DST, 4)
109deflit(`FRAME',0)
110
C Entry point for the selected operation.  After the loads below eax is
C src1, ebx is src2, ecx is size and edx is dst.  ebx is pushed at entry and
C esi is pushed once past the one limb case.  Each exit path pops exactly
C what was pushed on the way to it.  A size of 0 is not tested for and would
C take the one limb path -- presumably callers always pass size of at least
C 1, confirm.
111	TEXT
112	ALIGN(32)
113PROLOGUE(M4_function)
114			movl	PARAM_SIZE, %ecx
115			pushl	%ebx		FRAME_pushl()
116
117			movl	PARAM_SRC1, %eax
118
119			movl	PARAM_SRC2, %ebx
120			cmpl	$1, %ecx
121
122			movl	PARAM_DST, %edx
123			ja	L(two_or_more)
124
125
			C One limb only.  ecx is no longer needed as a count so it
			C is reused for the data.  Integer code only, so there is
			C no MMX state to clear before returning.
126			movl	(%ebx), %ecx
127			popl	%ebx
128ifelse(M4_i_neg_src2,1,`notl_or_xorl_GMP_NUMB_MASK(	%ecx)')
129			M4_i	(%eax), %ecx
130ifelse(M4_i_neg_dst,1,`	notl_or_xorl_GMP_NUMB_MASK(	%ecx)')
131			movl	%ecx, (%edx)
132
133			ret
134
135
136L(two_or_more):
137			C eax	src1
138			C ebx	src2
139			C ecx	size
140			C edx	dst
141			C esi
142			C edi
143			C ebp
144
145			pushl	%esi		FRAME_pushl()
146			testl	$4, %eax
147			jz	L(alignment_ok)
148
			C Bit 2 of src1 is set, so handle one limb with integer
			C code and advance all three pointers by 4 bytes.  This
			C clears bit 2 of src1 for the movq accesses in the loop.
149			movl	(%ebx), %esi
150			addl	$4, %ebx
151ifelse(M4_i_neg_src2,1,`notl_or_xorl_GMP_NUMB_MASK(	%esi)')
152			M4_i	(%eax), %esi
153			addl	$4, %eax
154ifelse(M4_i_neg_dst,1,`	notl_or_xorl_GMP_NUMB_MASK(	%esi)')
155			movl	%esi, (%edx)
156			addl	$4, %edx
157			decl	%ecx
158
159L(alignment_ok):
			C esi keeps the remaining limb count.  ecx becomes the
			C number of 2-limb pairs and the shift leaves the low bit
			C of the count in the carry flag for the test after the
			C loop.
160			movl	%ecx, %esi
161			shrl	%ecx
162			jnz	L(still_two_or_more)
163
			C No pairs, so exactly one limb is left.  Finish it with
			C integer code.  No MMX instruction has run on this path
			C so it returns without emms_or_femms.
164			movl	(%ebx), %ecx
165			popl	%esi
166ifelse(M4_i_neg_src2,1,`notl_or_xorl_GMP_NUMB_MASK(	%ecx)')
167			M4_i	(%eax), %ecx
168ifelse(M4_i_neg_dst,1,`	notl_or_xorl_GMP_NUMB_MASK(	%ecx)')
169			popl	%ebx
170			movl	%ecx, (%edx)
171			ret
172
173
174L(still_two_or_more):
			C mm7 is the complement mask used by the pxor steps in the
			C loop.  It is built only when the chosen operation needs
			C a complement of src2 or of the result.
175ifelse(eval(M4_p_neg_src2 || M4_p_neg_dst),1,`
176			pcmpeqd	%mm7, %mm7		C all ones
177ifelse(GMP_NAIL_BITS,0,,`psrld	$GMP_NAIL_BITS, %mm7')	C clear nails
178')
179
180			ALIGN(16)
181L(top):
182			C eax	src1
183			C ebx	src2
184			C ecx	counter
185			C edx	dst
186			C esi
187			C edi
188			C ebp
189			C
190			C carry bit is low of size
			C
			C ecx counts down to zero, so the pairs are processed from
			C the high end of the operands to the low end.  The code
			C relies on the MMX instructions and loop leaving the
			C carry flag unchanged.
191
192			movq	-8(%ebx,%ecx,8), %mm0
193ifelse(M4_p_neg_src2,1,`pxor	%mm7, %mm0')
194			M4_p	-8(%eax,%ecx,8), %mm0
195ifelse(M4_p_neg_dst,1,`	pxor	%mm7, %mm0')
196			movq	%mm0, -8(%edx,%ecx,8)
197
198			loop	L(top)
199
200
201			jnc	L(no_extra)
202
			C The limb count in esi was odd.  The limb at index esi-1
			C is the one the pair loop did not reach, so do it with
			C integer code.  ebx is overwritten with data because
			C src2 is not needed again.
203			movl	-4(%ebx,%esi,4), %ebx
204ifelse(M4_i_neg_src2,1,`notl_or_xorl_GMP_NUMB_MASK(	%ebx)')
205			M4_i	-4(%eax,%esi,4), %ebx
206ifelse(M4_i_neg_dst,1,`	notl_or_xorl_GMP_NUMB_MASK(	%ebx)')
207			movl	%ebx, -4(%edx,%esi,4)
208L(no_extra):
209
			C Restore the saved registers in reverse push order.
			C NOTE(review): emms_or_femms presumably expands to emms
			C or femms to leave MMX state after the loop -- confirm.
210			popl	%esi
211			popl	%ebx
212			emms_or_femms
213			ret
214
215EPILOGUE()
216