dnl  xref: /netbsd-src/external/lgpl3/gmp/dist/mpn/x86/k6/mmx/logops_n.asm (revision ce54336801cf28877c3414aa2fcb251dddd543a2)
dnl  AMD K6-2 mpn_and_n, mpn_andn_n, mpn_nand_n, mpn_ior_n, mpn_iorn_n,
dnl  mpn_nior_n, mpn_xor_n, mpn_xnor_n -- mpn bitwise logical operations.

dnl  Copyright 1999-2002 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

dnl  NOTE(review): NAILS_SUPPORT is defined outside this file; presumably it
dnl  declares that this code works for any nail count from 0 to 31.  The code
dnl  below does consult GMP_NAIL_BITS when building its complement mask.
NAILS_SUPPORT(0-31)


C Speeds in cycles/limb (c/l).
C
C         alignment dst/src1/src2, A=0mod8, N=4mod8
C      A/A/A A/A/N A/N/A A/N/N N/A/A N/A/N N/N/A N/N/N
C
C K6-2  1.2   1.5   1.5   1.2   1.2   1.5   1.5   1.2   and,andn,ior,xor
C K6-2  1.5   1.75  2.0   1.75  1.75  2.0   1.75  1.5   iorn,xnor
C K6-2  1.75  2.0   2.0   2.0   2.0   2.0   2.0   1.75  nand,nior
C
C K6    1.5   1.68  1.75  1.2   1.75  1.75  1.68  1.5   and,andn,ior,xor
C K6    2.0   2.0   2.25  2.25  2.25  2.25  2.0   2.0   iorn,xnor
C K6    2.0   2.25  2.25  2.25  2.25  2.25  2.25  2.0   nand,nior


dnl  Usage: M4_choose_op(name, p_insn, p_neg_dst, p_neg_src2,
dnl                            i_insn, i_neg_dst, i_neg_src2)
dnl
dnl  When OPERATION_<name> is defined, record the settings for that operation
dnl  in the macros below.  When it is not defined, expand to nothing.
dnl
dnl  M4_function is the entry point name, mpn_<name>
dnl  M4_operation is <name> itself
dnl  M4_p and M4_i are the MMX and integer instructions
dnl  M4_*_neg_dst means whether to negate the final result before writing
dnl  M4_*_neg_src2 means whether to negate the src2 values before using them
dnl
dnl  "Negate" is a bitwise complement: the code applies it with pxor against
dnl  an all-ones mask (MMX) or with notl_or_xorl_GMP_NUMB_MASK (integer).

define(M4_choose_op,
m4_assert_numargs(7)
`ifdef(`OPERATION_$1',`
define(`M4_function',  `mpn_$1')
define(`M4_operation', `$1')
define(`M4_p',         `$2')
define(`M4_p_neg_dst', `$3')
define(`M4_p_neg_src2',`$4')
define(`M4_i',         `$5')
define(`M4_i_neg_dst', `$6')
define(`M4_i_neg_src2',`$7')
')')

dnl  Operation table.  Each line gives the name, then the MMX instruction
dnl  with its neg_dst and neg_src2 flags, then the integer instruction with
dnl  its neg_dst and neg_src2 flags.  Only the line whose OPERATION_<name>
dnl  symbol is defined has any effect.
dnl
dnl  xnor is done in "iorn" style because it's a touch faster than "nior"
dnl  style (the two are equivalent for xor).
dnl
dnl  pandn can't be used with nails.
dnl
dnl  Without nails andn_n needs no separate src2 complement on the MMX side:
dnl  the code loads src2 into the register first and pandn complements its
dnl  register operand before the and with src1.

M4_choose_op( and_n,  pand,0,0,  andl,0,0)
ifelse(GMP_NAIL_BITS,0,
`M4_choose_op(andn_n, pandn,0,0, andl,0,1)',
`M4_choose_op(andn_n, pand,0,1,  andl,0,1)')
M4_choose_op( nand_n, pand,1,0,  andl,1,0)
M4_choose_op( ior_n,  por,0,0,   orl,0,0)
M4_choose_op( iorn_n, por,0,1,   orl,0,1)
M4_choose_op( nior_n, por,1,0,   orl,1,0)
M4_choose_op( xor_n,  pxor,0,0,  xorl,0,0)
M4_choose_op( xnor_n, pxor,0,1,  xorl,0,1)

dnl  M4_function is still undefined here if none of the OPERATION_ symbols
dnl  above was defined, in which case stop with an error.
ifdef(`M4_function',,
`m4_error(`Unrecognised or undefined OPERATION symbol
')')

dnl  NOTE(review): MULFUNC_PROLOGUE is defined outside this file; presumably
dnl  it announces the full set of entry points this one source can provide.
MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n)


C void M4_function (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
C                   mp_size_t size);
C
C Do src1,size M4_operation src2,size, storing the result in dst,size.
C
C Unaligned movq loads and stores are a bit slower than aligned ones.  The
C test at the start of the routine checks the alignment of src1 and if
C necessary processes one limb separately at the low end to make it aligned.
C
C The raw speeds without this alignment switch are as follows.
C
C           alignment dst/src1/src2, A=0mod8, N=4mod8
C     A/A/A  A/A/N  A/N/A  A/N/N  N/A/A  N/A/N  N/N/A  N/N/N
C
C K6                 1.5    2.0                 1.5    2.0    and,andn,ior,xor
C K6                 1.75   2.2                 2.0    2.28   iorn,xnor
C K6                 2.0    2.25                2.35   2.28   nand,nior
C
C
C Future:
C
C K6 can do one 64-bit load per cycle so each of these routines should be
C able to approach 1.0 c/l, if aligned.  The basic and/andn/ior/xor might be
C able to get 1.0 with just a 4 limb loop, being 3 instructions per 2 limbs.
C The others are 4 instructions per 2 limbs, and so can only approach 1.0
C because there's nowhere to hide some loop control.

dnl  Stack offsets of the four arguments, in prototype order dst, src1, src2,
dnl  size.  FRAME starts at 0 and each push in the code is followed by
dnl  FRAME_pushl().
dnl  NOTE(review): defframe and FRAME_pushl are defined outside this file;
dnl  presumably they keep the PARAM_ names valid after the stack pointer moves.

defframe(PARAM_SIZE,16)
defframe(PARAM_SRC2,12)
defframe(PARAM_SRC1,8)
defframe(PARAM_DST, 4)
deflit(`FRAME',0)

	TEXT
	ALIGN(32)
PROLOGUE(M4_function)
			C Load the arguments.  Register use from here on is
			C
			C eax	src1
			C ebx	src2, caller value saved on the stack
			C ecx	size
			C edx	dst
			C
			C movl leaves the flags alone, so the load of dst can
			C sit between the cmpl and the ja that uses its result.

			movl	PARAM_SIZE, %ecx
			pushl	%ebx		FRAME_pushl()

			movl	PARAM_SRC1, %eax

			movl	PARAM_SRC2, %ebx
			cmpl	$1, %ecx

			movl	PARAM_DST, %edx
			ja	L(two_or_more)		C unsigned size > 1


			C One limb only, done with integer instructions.
			C NOTE(review): a size of 0 also arrives here and would
			C read and write one limb.  Presumably callers always
			C pass size >= 1, confirm against the mpn interface.

			movl	(%ebx), %ecx		C src2 limb, size is dead now
			popl	%ebx
ifelse(M4_i_neg_src2,1,`notl_or_xorl_GMP_NUMB_MASK(	%ecx)')
			M4_i	(%eax), %ecx
ifelse(M4_i_neg_dst,1,`	notl_or_xorl_GMP_NUMB_MASK(	%ecx)')
			movl	%ecx, (%edx)

			ret


L(two_or_more):
			C eax	src1
			C ebx	src2
			C ecx	size, at least 2
			C edx	dst
			C esi	caller value, saved just below
			C edi
			C ebp

			pushl	%esi		FRAME_pushl()
			testl	$4, %eax		C bit 2 of the src1 address
			jz	L(alignment_ok)

			C Bit 2 of src1 is set.  Do the lowest limb with integer
			C instructions and advance all three pointers by one
			C limb, which leaves bit 2 of src1 clear for the movq
			C loop.  esi is free as scratch until L(alignment_ok).

			movl	(%ebx), %esi
			addl	$4, %ebx
ifelse(M4_i_neg_src2,1,`notl_or_xorl_GMP_NUMB_MASK(	%esi)')
			M4_i	(%eax), %esi
			addl	$4, %eax
ifelse(M4_i_neg_dst,1,`	notl_or_xorl_GMP_NUMB_MASK(	%esi)')
			movl	%esi, (%edx)
			addl	$4, %edx
			decl	%ecx			C at least 1 limb remains

L(alignment_ok):
			C ecx is the number of limbs still to do, at least 1.
			C Keep that count in esi and halve ecx into a count of
			C limb pairs.  shrl puts the low bit of the count in the
			C carry flag, which is tested after the loop.

			movl	%ecx, %esi
			shrl	%ecx
			jnz	L(still_two_or_more)

			C Zero pairs, so exactly one limb is left at offset 0.
			C No MMX instruction has run on this path, so there is
			C no emms before the ret.

			movl	(%ebx), %ecx
			popl	%esi
ifelse(M4_i_neg_src2,1,`notl_or_xorl_GMP_NUMB_MASK(	%ecx)')
			M4_i	(%eax), %ecx
ifelse(M4_i_neg_dst,1,`	notl_or_xorl_GMP_NUMB_MASK(	%ecx)')
			popl	%ebx
			movl	%ecx, (%edx)
			ret


L(still_two_or_more):
			C mm7 is set up only for operations needing a complement.
			C It is all ones in each 32 bit half, shifted right by
			C the nail count when nails are in use.
ifelse(eval(M4_p_neg_src2 || M4_p_neg_dst),1,`
			pcmpeqd	%mm7, %mm7		C all ones
ifelse(GMP_NAIL_BITS,0,,`psrld	$GMP_NAIL_BITS, %mm7')	C clear nails
')

			ALIGN(16)
L(top):
			C eax	src1
			C ebx	src2
			C ecx	counter, limb pairs still to do
			C edx	dst
			C esi	limb count as it was at L(alignment_ok)
			C edi
			C ebp
			C
			C carry bit is low of size
			C
			C Pairs are processed from the high end downwards, pair
			C ecx-1 on each pass.  Neither the MMX instructions nor
			C loop write the flags, so the carry from the shrl above
			C is still intact at the jnc below.

			movq	-8(%ebx,%ecx,8), %mm0
ifelse(M4_p_neg_src2,1,`pxor	%mm7, %mm0')
			M4_p	-8(%eax,%ecx,8), %mm0
ifelse(M4_p_neg_dst,1,`	pxor	%mm7, %mm0')
			movq	%mm0, -8(%edx,%ecx,8)

			loop	L(top)


			jnc	L(no_extra)		C limb count was even

			C Odd count.  The pairs covered limbs 0 to esi-2, so
			C limb esi-1 is done here with integer instructions.
			C ebx doubles as scratch: the src2 pointer is used by
			C the load that overwrites it and is not needed again.

			movl	-4(%ebx,%esi,4), %ebx
ifelse(M4_i_neg_src2,1,`notl_or_xorl_GMP_NUMB_MASK(	%ebx)')
			M4_i	-4(%eax,%esi,4), %ebx
ifelse(M4_i_neg_dst,1,`	notl_or_xorl_GMP_NUMB_MASK(	%ebx)')
			movl	%ebx, -4(%edx,%esi,4)
L(no_extra):

			C Restore the saved registers in reverse push order.
			C NOTE(review): emms_or_femms is defined outside this
			C file; presumably it clears the MMX state for any x87
			C code in the caller.

			popl	%esi
			popl	%ebx
			emms_or_femms
			ret

EPILOGUE()