1dnl AMD K7 mpn_add_n/mpn_sub_n -- mpn add or subtract. 2 3dnl Copyright 1999-2003 Free Software Foundation, Inc. 4 5dnl This file is part of the GNU MP Library. 6dnl 7dnl The GNU MP Library is free software; you can redistribute it and/or modify 8dnl it under the terms of either: 9dnl 10dnl * the GNU Lesser General Public License as published by the Free 11dnl Software Foundation; either version 3 of the License, or (at your 12dnl option) any later version. 13dnl 14dnl or 15dnl 16dnl * the GNU General Public License as published by the Free Software 17dnl Foundation; either version 2 of the License, or (at your option) any 18dnl later version. 19dnl 20dnl or both in parallel, as here. 21dnl 22dnl The GNU MP Library is distributed in the hope that it will be useful, but 23dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 24dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 25dnl for more details. 26dnl 27dnl You should have received copies of the GNU General Public License and the 28dnl GNU Lesser General Public License along with the GNU MP Library. If not, 29dnl see https://www.gnu.org/licenses/. 30 31include(`../config.m4') 32 33 34C K7: 1.64 cycles/limb (at 16 limbs/loop). 35 36 37 38dnl K7: UNROLL_COUNT cycles/limb 39dnl 8 1.9 40dnl 16 1.64 41dnl 32 1.7 42dnl 64 2.0 43dnl Maximum possible with the current code is 64. 
dnl  Number of limbs processed by one pass of the unrolled loop.
deflit(UNROLL_COUNT, 16)


dnl  Select the carry-propagating instruction (adcl or sbbl), the two
dnl  function names and the verb used in the description below, according to
dnl  which of OPERATION_add_n and OPERATION_sub_n is defined.  If neither is
dnl  defined the build is stopped through m4_error.
ifdef(`OPERATION_add_n', `
	define(M4_inst,        adcl)
	define(M4_function_n,  mpn_add_n)
	define(M4_function_nc, mpn_add_nc)
	define(M4_description, add)
',`ifdef(`OPERATION_sub_n', `
	define(M4_inst,        sbbl)
	define(M4_function_n,  mpn_sub_n)
	define(M4_function_nc, mpn_sub_nc)
	define(M4_description, subtract)
',`m4_error(`Need OPERATION_add_n or OPERATION_sub_n
')')')

dnl  NOTE(review): MULFUNC_PROLOGUE is defined outside this file; presumably
dnl  it lists the entry points this source can be built into -- confirm.
MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)


C mp_limb_t M4_function_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
C                          mp_size_t size);
C mp_limb_t M4_function_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
C                           mp_size_t size, mp_limb_t carry);
C
C Calculate src1,size M4_description src2,size, and store the result in
C dst,size.  The return value is the carry bit from the top of the result (1
C or 0).
C
C The _nc version accepts 1 or 0 for an initial carry into the low limb of
C the calculation.  Note values other than 1 or 0 here will lead to garbage
C results.
C
C This code runs at 1.64 cycles/limb, which might be the best possible with
C plain integer operations.  Each limb is 2 loads and 1 store, any 2 of
C which can be done each cycle, leading to 1.5 c/l.

dnl  Must have UNROLL_THRESHOLD >= 2, since the unrolled loop can't handle 1.
dnl  Sizes below UNROLL_THRESHOLD limbs are handled by the simple loop, sizes
dnl  at or above it by the unrolled loop.  The PIC and non-PIC builds
dnl  currently use the same value.
ifdef(`PIC',`
deflit(UNROLL_THRESHOLD, 8)
',`
deflit(UNROLL_THRESHOLD, 8)
')

dnl  Stack arguments.  NOTE(review): defframe is defined outside this file;
dnl  the offsets look like positions relative to the stack pointer at entry,
dnl  tracked through the FRAME value -- confirm against the m4 support files.
defframe(PARAM_CARRY,20)
defframe(PARAM_SIZE, 16)
defframe(PARAM_SRC2, 12)
defframe(PARAM_SRC1, 8)
defframe(PARAM_DST,  4)

dnl  Register save slots, inside the STACK_SPACE bytes reserved at L(start).
defframe(SAVE_EBP, -4)
defframe(SAVE_ESI, -8)
defframe(SAVE_EBX, -12)
defframe(SAVE_EDI, -16)
deflit(STACK_SPACE, 16)

	TEXT
	ALIGN(32)
deflit(`FRAME',0)

C M4_function_nc -- as M4_function_n, but with a caller supplied carry-in.
C Loads the carry argument into eax and joins the shared body at L(start).
PROLOGUE(M4_function_nc)
	movl	PARAM_CARRY, %eax
	jmp	L(start)
EPILOGUE()

C M4_function_n -- dst,size = src1,size M4_description src2,size.
C
C In:   stack arguments PARAM_DST, PARAM_SRC1, PARAM_SRC2 and PARAM_SIZE
C       (size counts 4 byte limbs); the _nc entry also reads PARAM_CARRY.
C Out:  eax = carry or borrow out of the top limb, produced with setc.
C Regs: ebx and edi are saved and restored on every path, esi and ebp on
C       the unrolled path only.  ecx and edx are used as scratch.
C Note: the unrolled path overwrites the PARAM_SIZE stack slot with the low
C       bit of size.
PROLOGUE(M4_function_n)

	xorl	%eax, %eax	C carry-in is zero for the plain entry point
L(start):
	C eax holds the carry-in on both entry paths
	movl	PARAM_SIZE, %ecx
	subl	$STACK_SPACE, %esp
deflit(`FRAME',STACK_SPACE)

	movl	%edi, SAVE_EDI
	movl	%ebx, SAVE_EBX
	cmpl	$UNROLL_THRESHOLD, %ecx

	C The movl instructions below leave the cmpl flags intact for jae.
	movl	PARAM_SRC2, %edx
	movl	PARAM_SRC1, %ebx
	jae	L(unroll)

	C Simple loop setup: point every pointer at the end of its operand
	C and run a negative limb index up to zero.
	movl	PARAM_DST, %edi
	leal	(%ebx,%ecx,4), %ebx
	leal	(%edx,%ecx,4), %edx

	leal	(%edi,%ecx,4), %edi
	negl	%ecx
	shrl	%eax		C bit 0 of the carry-in goes to the carry flag

	C This loop is in a single 16 byte code block already, so no
	C alignment necessary.
	C NOTE(review): a size of zero is not handled here, the index would
	C be incremented away from zero before the first test.  Presumably
	C callers guarantee size >= 1 -- confirm.
L(simple):
	C eax	scratch
	C ebx	src1
	C ecx	counter
	C edx	src2
	C esi
	C edi	dst
	C ebp

	movl	(%ebx,%ecx,4), %eax
	M4_inst	(%edx,%ecx,4), %eax
	movl	%eax, (%edi,%ecx,4)
	incl	%ecx		C incl leaves the carry flag alone
	jnz	L(simple)

	movl	$0, %eax	C movl, not xorl: the carry flag must survive
	movl	SAVE_EDI, %edi

	movl	SAVE_EBX, %ebx
	setc	%al		C return value is the final carry flag
	addl	$STACK_SPACE, %esp

	ret


C -----------------------------------------------------------------------------
	C This is at 0x55, close enough to aligned.
L(unroll):
deflit(`FRAME',STACK_SPACE)
	movl	%ebp, SAVE_EBP
	andl	$-2, %ecx		C size low bit masked out
	andl	$1, PARAM_SIZE		C size low bit kept (in the stack slot)

	movl	%ecx, %edi
	decl	%ecx
	movl	PARAM_DST, %ebp

	shrl	$UNROLL_LOG2, %ecx	C loop passes less one, see decl/jns
	negl	%edi
	movl	%esi, SAVE_ESI

	C edi = limbs to skip in the first pass.  NOTE(review): assumes
	C UNROLL_MASK is UNROLL_COUNT-1, it is defined outside this file.
	andl	$UNROLL_MASK, %edi

ifdef(`PIC',`
	call	L(pic_calc)
L(here):
',`
	leal	L(entry) (%edi,%edi,8), %esi	C 9 code bytes per limb skipped
')
	negl	%edi
	shrl	%eax		C bit 0 of the carry-in goes to the carry flag

	C Back the pointers up by the skipped limbs so the first executed
	C chunk lands on limb 0; leal does not touch the carry flag.  When
	C UNROLL_BYTES is 256 the pointers also get a +128 bias, which the
	C loop displacements and the odd limb tail subtract again.
	leal	ifelse(UNROLL_BYTES,256,128) (%ebx,%edi,4), %ebx
	leal	ifelse(UNROLL_BYTES,256,128) (%edx,%edi,4), %edx
	leal	ifelse(UNROLL_BYTES,256,128) (%ebp,%edi,4), %edi

	jmp	*%esi


ifdef(`PIC',`
L(pic_calc):
	C See mpn/x86/README about old gas bugs
	C esi = 9*edi + distance from L(here) to L(entry) + return address
	leal	(%edi,%edi,8), %esi
	addl	$L(entry)-L(here), %esi
	addl	(%esp), %esi
	ret_internal
')


C -----------------------------------------------------------------------------
	ALIGN(32)
L(top):
	C eax	zero
	C ebx	src1
	C ecx	counter
	C edx	src2
	C esi	scratch (was computed jump)
	C edi	dst
	C ebp	scratch

	C edx is advanced here rather than at the bottom of the loop, so
	C after the final pass it trails ebx and edi by UNROLL_BYTES.
	leal	UNROLL_BYTES(%edx), %edx

	C Each chunk below handles two limbs, one through esi and one
	C through ebp.  NOTE(review): Zdisp is defined outside this file;
	C presumably it keeps an explicit displacement byte when disp0 is
	C zero so every limb stays at 9 code bytes -- confirm.
L(entry):
deflit(CHUNK_COUNT, 2)
forloop(i, 0, UNROLL_COUNT/CHUNK_COUNT-1, `
	deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
	deflit(`disp1', eval(disp0 + 4))

Zdisp(	movl,	disp0,(%ebx), %esi)
	movl	disp1(%ebx), %ebp
Zdisp(	M4_inst,disp0,(%edx), %esi)
Zdisp(	movl,	%esi, disp0,(%edi))
	M4_inst	disp1(%edx), %ebp
	movl	%ebp, disp1(%edi)
')

	decl	%ecx		C decl leaves the carry flag alone
	leal	UNROLL_BYTES(%ebx), %ebx
	leal	UNROLL_BYTES(%edi), %edi
	jns	L(top)


	movl	PARAM_SIZE, %esi	C low bit of size, stored at L(unroll)
	movl	SAVE_EBP, %ebp
	movl	$0, %eax	C movl, not xorl: the carry flag must survive

	decl	%esi		C sign set when size was even
	js	L(even)

	C Odd size, one limb left.  edx is still UNROLL_BYTES behind, and
	C when UNROLL_BYTES is 256 all three pointers carry the +128 bias
	C applied before the computed jump, so the same -128 adjustment as
	C in the loop displacements is needed here.  For any other
	C UNROLL_BYTES the ifelse expands to nothing.
	movl	ifelse(UNROLL_BYTES,256,-128)(%ebx), %ecx
	M4_inst	eval(UNROLL_BYTES ifelse(UNROLL_BYTES,256,-128))(%edx), %ecx
	movl	%ecx, ifelse(UNROLL_BYTES,256,-128)(%edi)
L(even):

	movl	SAVE_EDI, %edi
	movl	SAVE_EBX, %ebx
	setc	%al		C return value is the final carry flag

	movl	SAVE_ESI, %esi
	addl	$STACK_SPACE, %esp

	ret

EPILOGUE()