1dnl AMD K6-2 mpn_and_n, mpn_andn_n, mpn_nand_n, mpn_ior_n, mpn_iorn_n, 2dnl mpn_nior_n, mpn_xor_n, mpn_xnor_n -- mpn bitwise logical operations. 3 4dnl Copyright 1999-2002 Free Software Foundation, Inc. 5 6dnl This file is part of the GNU MP Library. 7dnl 8dnl The GNU MP Library is free software; you can redistribute it and/or modify 9dnl it under the terms of either: 10dnl 11dnl * the GNU Lesser General Public License as published by the Free 12dnl Software Foundation; either version 3 of the License, or (at your 13dnl option) any later version. 14dnl 15dnl or 16dnl 17dnl * the GNU General Public License as published by the Free Software 18dnl Foundation; either version 2 of the License, or (at your option) any 19dnl later version. 20dnl 21dnl or both in parallel, as here. 22dnl 23dnl The GNU MP Library is distributed in the hope that it will be useful, but 24dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 25dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 26dnl for more details. 27dnl 28dnl You should have received copies of the GNU General Public License and the 29dnl GNU Lesser General Public License along with the GNU MP Library. If not, 30dnl see https://www.gnu.org/licenses/. 
include(`../config.m4')

NAILS_SUPPORT(0-31)


C           alignment dst/src1/src2, A=0mod8, N=4mod8
C      A/A/A  A/A/N  A/N/A  A/N/N  N/A/A  N/A/N  N/N/A  N/N/N
C
C K6-2  1.2   1.5    1.5    1.2    1.2    1.5    1.5    1.2    and,andn,ior,xor
C K6-2  1.5   1.75   2.0    1.75   1.75   2.0    1.75   1.5    iorn,xnor
C K6-2  1.75  2.0    2.0    2.0    2.0    2.0    2.0    1.75   nand,nior
C
C K6    1.5   1.68   1.75   1.2    1.75   1.75   1.68   1.5    and,andn,ior,xor
C K6    2.0   2.0    2.25   2.25   2.25   2.25   2.0    2.0    iorn,xnor
C K6    2.0   2.25   2.25   2.25   2.25   2.25   2.25   2.0    nand,nior


dnl  This is a multi-function file: the OPERATION_* symbol defined when m4
dnl  is run selects which one of the eight entrypoints is generated.
dnl
dnl  M4_p and M4_i are the MMX and integer instructions
dnl  M4_*_neg_dst means whether to negate the final result before writing
dnl  M4_*_neg_src2 means whether to negate the src2 values before using them

define(M4_choose_op,
m4_assert_numargs(7)
`ifdef(`OPERATION_$1',`
define(`M4_function', `mpn_$1')
define(`M4_operation', `$1')
define(`M4_p',         `$2')
define(`M4_p_neg_dst', `$3')
define(`M4_p_neg_src2',`$4')
define(`M4_i',         `$5')
define(`M4_i_neg_dst', `$6')
define(`M4_i_neg_src2',`$7')
')')

dnl  xnor is done in "iorn" style because it's a touch faster than "nior"
dnl  style (the two are equivalent for xor).
dnl
dnl  pandn can't be used with nails (it would negate the nail bits of src2
dnl  too), so the nails build of andn negates src2 explicitly instead.

M4_choose_op( and_n,  pand,0,0,  andl,0,0)
ifelse(GMP_NAIL_BITS,0,
`M4_choose_op(andn_n, pandn,0,0, andl,0,1)',
`M4_choose_op(andn_n, pand,0,1,  andl,0,1)')
M4_choose_op( nand_n, pand,1,0,  andl,1,0)
M4_choose_op( ior_n,  por,0,0,   orl,0,0)
M4_choose_op( iorn_n, por,0,1,   orl,0,1)
M4_choose_op( nior_n, por,1,0,   orl,1,0)
M4_choose_op( xor_n,  pxor,0,0,  xorl,0,0)
M4_choose_op( xnor_n, pxor,0,1,  xorl,0,1)

dnl  Fail the build loudly if no recognised OPERATION_* symbol was given.
ifdef(`M4_function',,
`m4_error(`Unrecognised or undefined OPERATION symbol
')')

MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n)


C void M4_function (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
C                   mp_size_t size);
C
C Do src1,size M4_operation src2,size, storing the result in dst,size.
C
C Unaligned movq loads and stores are a bit slower than aligned ones. The
C test at the start of the routine checks the alignment of src1 and if
C necessary processes one limb separately at the low end to make it aligned.
C
C The raw speeds without this alignment switch are as follows.
C
C           alignment dst/src1/src2, A=0mod8, N=4mod8
C      A/A/A  A/A/N  A/N/A  A/N/N  N/A/A  N/A/N  N/N/A  N/N/N
C
C K6    1.5          2.0                  1.5           2.0    and,andn,ior,xor
C K6    1.75         2.2                  2.0           2.28   iorn,xnor
C K6    2.0          2.25                 2.35          2.28   nand,nior
C
C
C Future:
C
C K6 can do one 64-bit load per cycle so each of these routines should be
C able to approach 1.0 c/l, if aligned. The basic and/andn/ior/xor might be
C able to get 1.0 with just a 4 limb loop, being 3 instructions per 2 limbs.
C The others are 4 instructions per 2 limbs, and so can only approach 1.0
C because there's nowhere to hide some loop control.

C Stack frame: plain cdecl args, no locals; FRAME tracks extra pushes.
defframe(PARAM_SIZE,16)
defframe(PARAM_SRC2,12)
defframe(PARAM_SRC1,8)
defframe(PARAM_DST, 4)
deflit(`FRAME',0)

	TEXT
	ALIGN(32)
PROLOGUE(M4_function)
	movl	PARAM_SIZE, %ecx
	pushl	%ebx	FRAME_pushl()	C ebx is callee-saved

	movl	PARAM_SRC1, %eax

	movl	PARAM_SRC2, %ebx
	cmpl	$1, %ecx

	movl	PARAM_DST, %edx
	ja	L(two_or_more)


	C size==1: do the single limb with plain integer ops, no MMX needed.
	movl	(%ebx), %ecx
	popl	%ebx
ifelse(M4_i_neg_src2,1,`notl_or_xorl_GMP_NUMB_MASK( %ecx)')
	M4_i	(%eax), %ecx
ifelse(M4_i_neg_dst,1,`	notl_or_xorl_GMP_NUMB_MASK( %ecx)')
	movl	%ecx, (%edx)

	ret


L(two_or_more):
	C eax	src1
	C ebx	src2
	C ecx	size
	C edx	dst
	C esi
	C edi
	C ebp

	pushl	%esi	FRAME_pushl()
	C If src1 is only 4mod8 aligned, peel off one limb at the low end so
	C the movq loads from src1 in the main loop are 8-byte aligned.
	testl	$4, %eax
	jz	L(alignment_ok)

	movl	(%ebx), %esi
	addl	$4, %ebx
ifelse(M4_i_neg_src2,1,`notl_or_xorl_GMP_NUMB_MASK( %esi)')
	M4_i	(%eax), %esi
	addl	$4, %eax
ifelse(M4_i_neg_dst,1,`	notl_or_xorl_GMP_NUMB_MASK( %esi)')
	movl	%esi, (%edx)
	addl	$4, %edx
	decl	%ecx

L(alignment_ok):
	C esi keeps the (possibly reduced) limb count for the final odd-limb
	C addressing; ecx becomes the count of 2-limb pairs, and the carry
	C flag from this shift records whether an odd limb is left over.
	movl	%ecx, %esi
	shrl	%ecx
	jnz	L(still_two_or_more)

	C Exactly one limb left after the alignment peel: integer ops again.
	movl	(%ebx), %ecx
	popl	%esi
ifelse(M4_i_neg_src2,1,`notl_or_xorl_GMP_NUMB_MASK( %ecx)')
	M4_i	(%eax), %ecx
ifelse(M4_i_neg_dst,1,`	notl_or_xorl_GMP_NUMB_MASK( %ecx)')
	popl	%ebx
	movl	%ecx, (%edx)
	ret


L(still_two_or_more):
ifelse(eval(M4_p_neg_src2 || M4_p_neg_dst),1,`
	C mm7 is the constant used by pxor to negate src2 and/or the result.
	pcmpeqd	%mm7, %mm7		C all ones
ifelse(GMP_NAIL_BITS,0,,`psrld	$GMP_NAIL_BITS, %mm7')	C clear nails
')

	ALIGN(16)
L(top):
	C eax	src1
	C ebx	src2
	C ecx	counter, limb pairs, decrementing
	C edx	dst
	C esi	original (post-peel) limb count, for the final odd limb
	C edi
	C ebp
	C
	C carry bit is low of size

	C Process two limbs per iteration with 64-bit MMX ops, walking the
	C operands downwards via the scaled-index addressing on ecx.
	movq	-8(%ebx,%ecx,8), %mm0
ifelse(M4_p_neg_src2,1,`pxor	%mm7, %mm0')
	M4_p	-8(%eax,%ecx,8), %mm0
ifelse(M4_p_neg_dst,1,`	pxor	%mm7, %mm0')
	movq	%mm0, -8(%edx,%ecx,8)

	C loop (unlike dec/jnz) leaves the flags untouched, and nothing in
	C the loop body writes flags, so the carry from the earlier shrl is
	C still valid for the jnc below.
	loop	L(top)


	jnc	L(no_extra)

	C Odd final limb, done with integer ops; ebx is dead as a pointer
	C now so it can hold the data.
	movl	-4(%ebx,%esi,4), %ebx
ifelse(M4_i_neg_src2,1,`notl_or_xorl_GMP_NUMB_MASK( %ebx)')
	M4_i	-4(%eax,%esi,4), %ebx
ifelse(M4_i_neg_dst,1,`	notl_or_xorl_GMP_NUMB_MASK( %ebx)')
	movl	%ebx, -4(%edx,%esi,4)
L(no_extra):

	popl	%esi
	popl	%ebx
	emms_or_femms		C leave MMX state (femms where available)
	ret

EPILOGUE()