1dnl AMD K6 mpn_add/sub_n -- mpn addition or subtraction. 2 3dnl Copyright 1999-2002 Free Software Foundation, Inc. 4 5dnl This file is part of the GNU MP Library. 6dnl 7dnl The GNU MP Library is free software; you can redistribute it and/or modify 8dnl it under the terms of either: 9dnl 10dnl * the GNU Lesser General Public License as published by the Free 11dnl Software Foundation; either version 3 of the License, or (at your 12dnl option) any later version. 13dnl 14dnl or 15dnl 16dnl * the GNU General Public License as published by the Free Software 17dnl Foundation; either version 2 of the License, or (at your option) any 18dnl later version. 19dnl 20dnl or both in parallel, as here. 21dnl 22dnl The GNU MP Library is distributed in the hope that it will be useful, but 23dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 24dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 25dnl for more details. 26dnl 27dnl You should have received copies of the GNU General Public License and the 28dnl GNU Lesser General Public License along with the GNU MP Library. If not, 29dnl see https://www.gnu.org/licenses/. 30 31include(`../config.m4') 32 33 34C K6: normal 3.25 cycles/limb, in-place 2.75 cycles/limb. 


C Select the add or the subtract flavour of this file.  When OPERATION_add_n
C is defined the add set of macros is used, otherwise OPERATION_sub_n selects
C the subtract set, and if neither is defined m4_error is invoked.
C
C   M4_inst         carry-propagating instruction applied to each limb
C   M4_function_n   name of the entry point without a carry-in argument
C   M4_function_nc  name of the entry point with a carry-in argument
C   M4_description  word substituted into the description comment below

ifdef(`OPERATION_add_n', `
	define(M4_inst,        adcl)
	define(M4_function_n,  mpn_add_n)
	define(M4_function_nc, mpn_add_nc)
	define(M4_description, add)
',`ifdef(`OPERATION_sub_n', `
	define(M4_inst,        sbbl)
	define(M4_function_n,  mpn_sub_n)
	define(M4_function_nc, mpn_sub_nc)
	define(M4_description, subtract)
',`m4_error(`Need OPERATION_add_n or OPERATION_sub_n
')')')

MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)


C mp_limb_t M4_function_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
C                          mp_size_t size);
C mp_limb_t M4_function_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
C                           mp_size_t size, mp_limb_t carry);
C
C Calculate src1,size M4_description src2,size, and store the result in
C dst,size.  The return value is the carry bit from the top of the result
C (1 or 0).  In the subtract flavour (M4_inst is sbbl) that bit is the
C borrow out of the top limb.
C
C The _nc version accepts 1 or 0 for an initial carry into the low limb of
C the calculation.  Note values other than 1 or 0 here will lead to garbage
C results.
C
C NOTE(review): no code path tests for size==0; the short-operand path hands
C size straight to the loop instruction as its count.  Callers are presumably
C required to pass size >= 1 -- confirm against the mpn interface contract.
C
C Instruction decoding limits a normal dst=src1+src2 operation to 3 c/l, and
C an in-place dst+=src to 2.5 c/l.  The unrolled loops have 1 cycle/loop of
C loop control, which with 4 limbs/loop means an extra 0.25 c/l.

C Stack argument offsets.  FRAME starts at 0, so on entry dst, src1, src2,
C size and (for the _nc entry point only) carry are read from 4, 8, 12, 16
C and 20 bytes above %esp.
C NOTE(review): FRAME_pushl, used after the first two pushl instructions
C below, presumably adds 4 to FRAME so that these offsets stay valid; it is
C defined outside this file.
define(PARAM_CARRY, `FRAME+20(%esp)')
define(PARAM_SIZE,  `FRAME+16(%esp)')
define(PARAM_SRC2,  `FRAME+12(%esp)')
define(PARAM_SRC1,  `FRAME+8(%esp)')
define(PARAM_DST,   `FRAME+4(%esp)')
deflit(`FRAME',0)

dnl  minimum 5 because the unrolled code can't handle less
deflit(UNROLL_THRESHOLD, 5)

	TEXT
	ALIGN(32)

C Entry point with an explicit carry-in: load it into eax and join the common
C code at L(start), which lives inside M4_function_n below.
PROLOGUE(M4_function_nc)
	movl	PARAM_CARRY, %eax
	jmp	L(start)
EPILOGUE()


C Entry point without a carry-in: eax is cleared so the common code starts
C with a zero initial carry.
C
C Overview of the common code:
C   size < UNROLL_THRESHOLD     one limb per pass at L(simple)
C   otherwise, dst == src1      in-place loop at L(inplace)
C   otherwise, dst == src2      add flavour only, in-place via
C                               L(inplace_reverse)
C   otherwise                   four limbs per pass at L(normal_top)
C
C ebx, edi and (on the unrolled paths) esi are saved on the stack and
C restored before returning.  The result in eax is 0 or 1, taken from the
C carry flag left by the last M4_inst.
PROLOGUE(M4_function_n)
	xorl	%eax, %eax
L(start):
	C eax	initial carry
	movl	PARAM_SIZE, %ecx
	pushl	%ebx
FRAME_pushl()

	movl	PARAM_SRC1, %ebx
	pushl	%edi
FRAME_pushl()

	movl	PARAM_SRC2, %edx
	cmpl	$UNROLL_THRESHOLD, %ecx

	C the movl does not alter flags, so jae still sees the cmpl result
	C (unsigned size >= UNROLL_THRESHOLD)
	movl	PARAM_DST, %edi
	jae	L(unroll)


	C bit 0 of eax is shifted out into the carry flag
	shrl	%eax		C initial carry flag

	C offset 0x21 here, close enough to aligned
L(simple):
	C eax	scratch
	C ebx	src1
	C ecx	counter
	C edx	src2
	C esi
	C edi	dst
	C ebp
	C
	C The store to (%edi) could be done with a stosl; it'd be smaller
	C code, but there's no speed gain and a cld would have to be added
	C (per mpn/x86/README).
	C
	C Pointers are advanced with leal and the count with loop, neither
	C of which modifies the carry flag, so the carry chains from one
	C M4_inst to the next.

	movl	(%ebx), %eax
	leal	4(%ebx), %ebx

	M4_inst	(%edx), %eax

	movl	%eax, (%edi)
	leal	4(%edi), %edi

	leal	4(%edx), %edx
	loop	L(simple)


	C movl $0 rather than xorl: movl leaves the carry flag intact for
	C the setc that forms the return value
	movl	$0, %eax
	popl	%edi

	setc	%al

	popl	%ebx
	ret


C -----------------------------------------------------------------------------
L(unroll):
	C eax	carry
	C ebx	src1
	C ecx	counter
	C edx	src2
	C esi
	C edi	dst
	C ebp

	C esi is saved here without a FRAME_pushl; no PARAM_ macro is used
	C after this point.  pushl does not alter the flags from the cmpl.
	cmpl	%edi, %ebx
	pushl	%esi

	je	L(inplace)

C dst==src2 can only be treated as in-place when the operands may be swapped,
C so this test is assembled for the add flavour only.
ifdef(`OPERATION_add_n',`
	cmpl	%edi, %edx

	je	L(inplace_reverse)
')

	C ecx = size rounded down to a multiple of 4, limbs for the loop
	C esi = size mod 4, limbs left for the tail (0 to 3)
	movl	%ecx, %esi

	andl	$-4, %ecx
	andl	$3, %esi

	C point each pointer at the end of the unrolled region and count a
	C negative index up towards zero
	leal	(%ebx,%ecx,4), %ebx
	leal	(%edx,%ecx,4), %edx
	leal	(%edi,%ecx,4), %edi

	negl	%ecx
	shrl	%eax		C initial carry into the carry flag

	ALIGN(32)
L(normal_top):
	C eax	scratch
	C ebx	src1
	C ecx	counter, limbs, negative
	C edx	src2
	C esi	remaining limbs for the tail
	C edi	dst
	C ebp
	C
	C ecx is advanced by 5 with leal and then reduced by 1 by loop, a
	C net step of 4 limbs, with neither instruction touching the carry
	C flag.  The -20 displacements (5 limbs of 4 bytes) cancel the early
	C increment.  loop exits when ecx reaches zero.

	movl	(%ebx,%ecx,4), %eax
	leal	5(%ecx), %ecx
	M4_inst	-20(%edx,%ecx,4), %eax
	movl	%eax, -20(%edi,%ecx,4)

	movl	4-20(%ebx,%ecx,4), %eax
	M4_inst	4-20(%edx,%ecx,4), %eax
	movl	%eax, 4-20(%edi,%ecx,4)

	movl	8-20(%ebx,%ecx,4), %eax
	M4_inst	8-20(%edx,%ecx,4), %eax
	movl	%eax, 8-20(%edi,%ecx,4)

	movl	12-20(%ebx,%ecx,4), %eax
	M4_inst	12-20(%edx,%ecx,4), %eax
	movl	%eax, 12-20(%edi,%ecx,4)

	loop	L(normal_top)


	C ecx is now 0.  decl sets the zero and sign flags but leaves the
	C carry flag alone: esi was 1 -> one limb left, esi was 0 -> done.
	decl	%esi
	jz	L(normal_finish_one)
	js	L(normal_done)

	C two or three more limbs

	movl	(%ebx), %eax
	M4_inst	(%edx), %eax
	movl	%eax, (%edi)

	movl	4(%ebx), %eax
	M4_inst	4(%edx), %eax
	decl	%esi
	movl	%eax, 4(%edi)

	jz	L(normal_done)
	movl	$2, %ecx	C third tail limb is at index 2

L(normal_finish_one):
	C ecx is 0 when reached by the jz above, 2 when fallen into
	movl	(%ebx,%ecx,4), %eax
	M4_inst	(%edx,%ecx,4), %eax
	movl	%eax, (%edi,%ecx,4)

L(normal_done):
	popl	%esi
	popl	%edi

	C movl $0 keeps the carry flag for the setc below
	movl	$0, %eax
	popl	%ebx

	setc	%al

	ret


C -----------------------------------------------------------------------------

C In-place operation: each limb is combined directly into dst memory with
C M4_inst register-to-memory, the source limbs being loaded one step ahead.

ifdef(`OPERATION_add_n',`
L(inplace_reverse):
	C dst==src2

	movl	%ebx, %edx
')

L(inplace):
	C eax	initial carry
	C ebx
	C ecx	size
	C edx	src
	C esi
	C edi	dst
	C ebp

	C ecx = (size-1) rounded down to a multiple of 4, limbs for the loop
	C esi = (size-1) mod 4, so 1 to 4 limbs are left for the tail
	leal	-1(%ecx), %esi
	decl	%ecx

	andl	$-4, %ecx
	andl	$3, %esi

	movl	(%edx), %ebx	C src low limb
	leal	(%edx,%ecx,4), %edx

	leal	(%edi,%ecx,4), %edi
	negl	%ecx

	shrl	%eax		C initial carry into the carry flag


	ALIGN(32)
L(inplace_top):
	C eax	scratch
	C ebx	next src limb
	C ecx	counter, limbs, negative
	C edx	src
	C esi	remaining limbs for the tail, less one
	C edi	dst
	C ebp
	C
	C Same indexing scheme as L(normal_top): leal adds 5, loop takes 1
	C back, and the -20 displacements compensate.  The final movl loads
	C the first source limb of the following pass, or of the tail.

	M4_inst	%ebx, (%edi,%ecx,4)

	movl	4(%edx,%ecx,4), %eax
	leal	5(%ecx), %ecx

	M4_inst	%eax, 4-20(%edi,%ecx,4)

	movl	8-20(%edx,%ecx,4), %eax
	movl	12-20(%edx,%ecx,4), %ebx

	M4_inst	%eax, 8-20(%edi,%ecx,4)
	M4_inst	%ebx, 12-20(%edi,%ecx,4)

	movl	16-20(%edx,%ecx,4), %ebx
	loop	L(inplace_top)


	C now %esi is 0 to 3 representing respectively 1 to 4 limbs more

	M4_inst	%ebx, (%edi)

	C decl leaves the carry flag alone; only zero and sign are tested
	decl	%esi
	jz	L(inplace_finish_one)
	js	L(inplace_done)

	C two or three more limbs

	movl	4(%edx), %eax
	movl	8(%edx), %ebx
	M4_inst	%eax, 4(%edi)
	M4_inst	%ebx, 8(%edi)

	decl	%esi
	movl	$2, %ecx	C does not disturb the flags from decl

	C L(normal_done) is the same pop/setc/ret sequence as
	C L(inplace_done) below
	jz	L(normal_done)

L(inplace_finish_one):
	C ecx is 0 when reached by the jz above, 2 when fallen into
	movl	4(%edx,%ecx,4), %eax
	M4_inst	%eax, 4(%edi,%ecx,4)

L(inplace_done):
	popl	%esi
	popl	%edi

	C movl $0 keeps the carry flag for the setc below
	movl	$0, %eax
	popl	%ebx

	setc	%al

	ret

EPILOGUE()