dnl  AMD K6-2 mpn_and_n, mpn_andn_n, mpn_nand_n, mpn_ior_n, mpn_iorn_n,
dnl  mpn_nior_n, mpn_xor_n, mpn_xnor_n -- mpn bitwise logical operations.

dnl  Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or
dnl  modify it under the terms of the GNU Lesser General Public License as
dnl  published by the Free Software Foundation; either version 3 of the
dnl  License, or (at your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful,
dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
dnl  Lesser General Public License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

NAILS_SUPPORT(0-31)


C         alignment dst/src1/src2, A=0mod8, N=4mod8
C      A/A/A A/A/N A/N/A A/N/N N/A/A N/A/N N/N/A N/N/N
C
C K6-2  1.2   1.5   1.5   1.2   1.2   1.5   1.5   1.2   and,andn,ior,xor
C K6-2  1.5   1.75  2.0   1.75  1.75  2.0   1.75  1.5   iorn,xnor
C K6-2  1.75  2.0   2.0   2.0   2.0   2.0   2.0   1.75  nand,nior
C
C K6    1.5   1.68  1.75  1.2   1.75  1.75  1.68  1.5   and,andn,ior,xor
C K6    2.0   2.0   2.25  2.25  2.25  2.25  2.0   2.0   iorn,xnor
C K6    2.0   2.25  2.25  2.25  2.25  2.25  2.25  2.0   nand,nior


dnl  Usage: M4_choose_op(operation, mmx_insn, mmx_neg_dst, mmx_neg_src2,
dnl                                 int_insn, int_neg_dst, int_neg_src2)
dnl
dnl  If the symbol OPERATION_<operation> is defined, record the settings for
dnl  that operation in the following macros; otherwise do nothing.
dnl
dnl  M4_function is the entry point name, mpn_<operation>
dnl  M4_operation is the bare operation name
dnl  M4_p and M4_i are the MMX and integer instructions
dnl  M4_*_neg_dst means whether to negate the final result before writing
dnl  M4_*_neg_src2 means whether to negate the src2 values before using them

define(M4_choose_op,
m4_assert_numargs(7)
`ifdef(`OPERATION_$1',`
define(`M4_function',  `mpn_$1')
define(`M4_operation', `$1')
define(`M4_p',         `$2')
define(`M4_p_neg_dst', `$3')
define(`M4_p_neg_src2',`$4')
define(`M4_i',         `$5')
define(`M4_i_neg_dst', `$6')
define(`M4_i_neg_src2',`$7')
')')

dnl  xnor is done in "iorn" style because it's a touch faster than "nior"
dnl  style (the two are equivalent for xor).
dnl
dnl  pandn can't be used with nails.

M4_choose_op( and_n,  pand,0,0,  andl,0,0)
ifelse(GMP_NAIL_BITS,0,
`M4_choose_op(andn_n, pandn,0,0, andl,0,1)',
`M4_choose_op(andn_n, pand,0,1,  andl,0,1)')
M4_choose_op( nand_n, pand,1,0,  andl,1,0)
M4_choose_op( ior_n,  por,0,0,   orl,0,0)
M4_choose_op( iorn_n, por,0,1,   orl,0,1)
M4_choose_op( nior_n, por,1,0,   orl,1,0)
M4_choose_op( xor_n,  pxor,0,0,  xorl,0,0)
M4_choose_op( xnor_n, pxor,0,1,  xorl,0,1)

dnl  None of the invocations above matched a defined OPERATION_ symbol if
dnl  M4_function is still undefined here, so stop with an error.

ifdef(`M4_function',,
`m4_error(`Unrecognised or undefined OPERATION symbol
')')

MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n)


C void M4_function (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
C                   mp_size_t size);
C
C Do src1,size M4_operation src2,size, storing the result in dst,size.
C
C Unaligned movq loads and stores are a bit slower than aligned ones.  The
C test at the start of the routine checks the alignment of src1 and if
C necessary processes one limb separately at the low end to make it aligned.
C
C The raw speeds without this alignment switch are as follows.
C
C           alignment dst/src1/src2, A=0mod8, N=4mod8
C        A/A/A A/A/N A/N/A A/N/N N/A/A N/A/N N/N/A N/N/N
C
C K6                 1.5   2.0                 1.5   2.0    and,andn,ior,xor
C K6                 1.75  2.2                 2.0   2.28   iorn,xnor
C K6                 2.0   2.25                2.35  2.28   nand,nior
C
C NOTE(review): only four figures per row are given for this table.  Their
C placement under the src1=N columns is an assumption made when the column
C layout was restored -- confirm against the upstream file.
C
C
C Future:
C
C K6 can do one 64-bit load per cycle so each of these routines should be
C able to approach 1.0 c/l, if aligned.  The basic and/andn/ior/xor might be
C able to get 1.0 with just a 4 limb loop, being 3 instructions per 2 limbs.
C The others are 4 instructions per 2 limbs, and so can only approach 1.0
C because there's nowhere to hide some loop control.
C Stack offsets of the four arguments, as seen on entry.

defframe(PARAM_SIZE,16)
defframe(PARAM_SRC2,12)
defframe(PARAM_SRC1,8)
defframe(PARAM_DST, 4)
deflit(`FRAME',0)

C -----------------------------------------------------------------------
C Entry point for the selected logical operation.
C
C In:     PARAM_DST, PARAM_SRC1, PARAM_SRC2, PARAM_SIZE on the stack.
C Saved:  ebx and esi are pushed and restored on every return path.
C Uses:   eax, ecx, edx as scratch, mm0 for data, and mm7 as the
C         complement mask when either MMX negation flag is set.
C
C Outline:
C   1. size of 1 (the unsigned test also sends 0 this way): one limb with
C      the integer instruction, no MMX.
C   2. Otherwise, if src1 is 4 mod 8, peel one low limb with the integer
C      instruction so that src1 becomes 0 mod 8.
C   3. Process the rest two limbs at a time with MMX, from the high end
C      down to the low end.
C   4. If the remaining limb count was odd, finish the top limb with the
C      integer instruction.
C -----------------------------------------------------------------------

	TEXT
	ALIGN(32)
PROLOGUE(M4_function)
	movl	PARAM_SIZE, %ecx
	pushl	%ebx		FRAME_pushl()

	movl	PARAM_SRC1, %eax

	movl	PARAM_SRC2, %ebx
	cmpl	$1, %ecx

	movl	PARAM_DST, %edx
	ja	L(two_or_more)		C unsigned size > 1


	C Single limb: dst[0] = src1[0] op src2[0], with the optional
	C complements applied to the src2 value and/or the result.

	movl	(%ebx), %ecx
	popl	%ebx
ifelse(M4_i_neg_src2,1,`notl_or_xorl_GMP_NUMB_MASK(	%ecx)')
	M4_i	(%eax), %ecx
ifelse(M4_i_neg_dst,1,`	notl_or_xorl_GMP_NUMB_MASK(	%ecx)')
	movl	%ecx, (%edx)

	ret


L(two_or_more):
	C eax	src1
	C ebx	src2
	C ecx	size
	C edx	dst
	C esi
	C edi
	C ebp

	pushl	%esi		FRAME_pushl()
	testl	$4, %eax		C is src1 4 mod 8 ?
	jz	L(alignment_ok)

	C Peel the lowest limb with the integer instruction and step all
	C three pointers past it, leaving src1 at 0 mod 8.

	movl	(%ebx), %esi
	addl	$4, %ebx
ifelse(M4_i_neg_src2,1,`notl_or_xorl_GMP_NUMB_MASK(	%esi)')
	M4_i	(%eax), %esi
	addl	$4, %eax
ifelse(M4_i_neg_dst,1,`	notl_or_xorl_GMP_NUMB_MASK(	%esi)')
	movl	%esi, (%edx)
	addl	$4, %edx
	decl	%ecx

L(alignment_ok):
	C esi keeps the remaining limb count.  ecx becomes the number of
	C limb pairs, and the shift leaves the low bit of the count in the
	C carry flag.  A zero pair count means exactly one limb is left.

	movl	%ecx, %esi
	shrl	%ecx
	jnz	L(still_two_or_more)

	C One limb left after the alignment step, done as in the single
	C limb case above.

	movl	(%ebx), %ecx
	popl	%esi
ifelse(M4_i_neg_src2,1,`notl_or_xorl_GMP_NUMB_MASK(	%ecx)')
	M4_i	(%eax), %ecx
ifelse(M4_i_neg_dst,1,`	notl_or_xorl_GMP_NUMB_MASK(	%ecx)')
	popl	%ebx
	movl	%ecx, (%edx)
	ret


L(still_two_or_more):
	C mm7 is only set up when one of the MMX negation flags is on.
ifelse(eval(M4_p_neg_src2 || M4_p_neg_dst),1,`
	pcmpeqd	%mm7, %mm7		C all ones
ifelse(GMP_NAIL_BITS,0,,`psrld	$GMP_NAIL_BITS, %mm7')	C clear nails
')

	ALIGN(16)
L(top):
	C eax	src1
	C ebx	src2
	C ecx	counter
	C edx	dst
	C esi
	C edi
	C ebp
	C
	C carry bit is low of size
	C
	C Each pass handles the limb pair at index ecx-1, so the pairs are
	C processed from the high end downwards.  Nothing in the loop body
	C writes the flags, so the carry from the shrl above is still valid
	C at the jnc below.

	movq	-8(%ebx,%ecx,8), %mm0
ifelse(M4_p_neg_src2,1,`pxor	%mm7, %mm0')
	M4_p	-8(%eax,%ecx,8), %mm0
ifelse(M4_p_neg_dst,1,`	pxor	%mm7, %mm0')
	movq	%mm0, -8(%edx,%ecx,8)

	loop	L(top)


	jnc	L(no_extra)		C even count, nothing left over

	C Odd count: the limb at index esi-1 is above the pairs just done.
	C ebx is reused for the data since the src2 pointer is not needed
	C after this load.

	movl	-4(%ebx,%esi,4), %ebx
ifelse(M4_i_neg_src2,1,`notl_or_xorl_GMP_NUMB_MASK(	%ebx)')
	M4_i	-4(%eax,%esi,4), %ebx
ifelse(M4_i_neg_dst,1,`	notl_or_xorl_GMP_NUMB_MASK(	%ebx)')
	movl	%ebx, -4(%edx,%esi,4)
L(no_extra):

	popl	%esi
	popl	%ebx
	emms_or_femms			C MMX registers were used on this path
	ret

EPILOGUE()