dnl  x86 generic mpn_sqr_basecase -- square an mpn number.

dnl  Copyright 1999, 2000, 2002, 2003 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.


include(`../config.m4')


C     cycles/crossproduct  cycles/triangleproduct
C P5
C P6
C K6
C K7
C P4


C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
C
C The algorithm is basically the same as mpn/generic/sqr_basecase.c, but a
C lot of function call overheads are avoided, especially when the size is
C small.
C
C The mul1 loop is not unrolled like mul_1.asm, it doesn't seem worth the
C code size to do so here.
C
C Enhancements:
C
C The addmul loop here is also not unrolled like aorsmul_1.asm and
C mul_basecase.asm are.  Perhaps it should be done.  It'd add to the
C complexity, but if it's worth doing in the other places then it should be
C worthwhile here.
C
C A fully-unrolled style like other sqr_basecase.asm versions (k6, k7, p6)
C might be worth considering.  That'd add quite a bit to the code size, but
C only as much as is used would be dragged into L1 cache.

dnl  Parameter offsets from %esp at function entry; FRAME_pushl() keeps
dnl  these valid as callee-saved registers are pushed.
defframe(PARAM_SIZE,12)
defframe(PARAM_SRC, 8)
defframe(PARAM_DST, 4)

	TEXT
	ALIGN(8)
PROLOGUE(mpn_sqr_basecase)
deflit(`FRAME',0)

	C Load the arguments and dispatch on operand size.  size==1 and
	C size==2 get dedicated straight-line code below; size>=3 falls
	C through to the general mul1 + addmul triangle + shift + diagonal
	C scheme.

	movl	PARAM_SIZE, %edx

	movl	PARAM_SRC, %eax

	cmpl	$2, %edx
	movl	PARAM_DST, %ecx

	je	L(two_limbs)
	ja	L(three_or_more)


C -----------------------------------------------------------------------------
C one limb only
	C eax	src
	C ebx
	C ecx	dst
	C edx

	C dst[1],dst[0] = src[0]^2, a single widening multiply.
	movl	(%eax), %eax
	mull	%eax
	movl	%eax, (%ecx)
	movl	%edx, 4(%ecx)
	ret


C -----------------------------------------------------------------------------
	ALIGN(8)
L(two_limbs):
	C eax	src
	C ebx
	C ecx	dst
	C edx

	pushl	%ebx
	pushl	%ebp

	movl	%eax, %ebx		C keep src pointer, eax becomes scratch
	movl	(%eax), %eax

	mull	%eax			C src[0]^2

	pushl	%esi
	pushl	%edi

	movl	%edx, %esi		C dst[1]
	movl	%eax, (%ecx)		C dst[0]

	movl	4(%ebx), %eax
	mull	%eax			C src[1]^2

	movl	%eax, %edi		C dst[2]
	movl	%edx, %ebp		C dst[3]

	movl	(%ebx), %eax
	mull	4(%ebx)			C src[0]*src[1]

	C The cross product eax:edx is added in twice (it occurs as both
	C src[0]*src[1] and src[1]*src[0] in the full square), each time
	C propagating the carry up through dst[3] in %ebp.
	addl	%eax, %esi

	adcl	%edx, %edi

	adcl	$0, %ebp
	addl	%esi, %eax

	adcl	%edi, %edx
	movl	%eax, 4(%ecx)

	adcl	$0, %ebp

	movl	%edx, 8(%ecx)
	movl	%ebp, 12(%ecx)

	popl	%edi
	popl	%esi

	popl	%ebp
	popl	%ebx

	ret


C -----------------------------------------------------------------------------
	ALIGN(8)
L(three_or_more):
deflit(`FRAME',0)
	C eax	src
	C ebx
	C ecx	dst
	C edx	size

	C Save callee-saved registers; FRAME_pushl() adjusts the PARAM_*
	C offsets for each push.
	pushl	%ebx	FRAME_pushl()
	pushl	%edi	FRAME_pushl()

	pushl	%esi	FRAME_pushl()
	pushl	%ebp	FRAME_pushl()

	leal	(%ecx,%edx,4), %edi	C &dst[size], end of this mul1
	leal	(%eax,%edx,4), %esi	C &src[size]

C First multiply src[0]*src[1..size-1] and store at dst[1..size].

	movl	(%eax), %ebp	C src[0], multiplier
	movl	%edx, %ecx

	negl	%ecx		C -size
	xorl	%ebx, %ebx	C clear carry limb

	incl	%ecx		C -(size-1)

L(mul1):
	C eax	scratch
	C ebx	carry
	C ecx	counter, limbs, negative
	C edx	scratch
	C esi	&src[size]
	C edi	&dst[size]
	C ebp	multiplier

	movl	(%esi,%ecx,4), %eax
	mull	%ebp
	addl	%eax, %ebx
	adcl	$0, %edx
	movl	%ebx, (%edi,%ecx,4)
	movl	%edx, %ebx
	incl	%ecx
	jnz	L(mul1)

	movl	%ebx, (%edi)	C final carry becomes dst[size]


	C Add products src[n]*src[n+1..size-1] at dst[2*n-1...], for
	C n=1..size-2.
	C
	C The last products src[size-2]*src[size-1], which is the end corner
	C of the product triangle, is handled separately at the end to save
	C looping overhead.  If size is 3 then it's only this that needs to
	C be done.
	C
	C In the outer loop %esi is a constant, and %edi just advances by 1
	C limb each time.  The size of the operation decreases by 1 limb
	C each time.

	C eax
	C ebx	carry (needing carry flag added)
	C ecx
	C edx
	C esi	&src[size]
	C edi	&dst[size]
	C ebp

	movl	PARAM_SIZE, %ecx
	subl	$3, %ecx
	jz	L(corner)	C size==3: only the corner product remains

	negl	%ecx

dnl  re-use parameter space
define(VAR_OUTER,`PARAM_DST')

L(outer):
	C eax
	C ebx
	C ecx	outer loop counter, -(size-3) to -1 (saved in VAR_OUTER)
	C edx
	C esi	&src[size]
	C edi	dst, pointing at stored carry limb of previous loop
	C ebp

	movl	%ecx, VAR_OUTER
	addl	$4, %edi	C advance dst end

	movl	-8(%esi,%ecx,4), %ebp	C next multiplier
	subl	$1, %ecx

	xorl	%ebx, %ebx	C initial carry limb

L(inner):
	C eax	scratch
	C ebx	carry (needing carry flag added)
	C ecx	counter, -n-1 to -1
	C edx	scratch
	C esi	&src[size]
	C edi	dst end of this addmul
	C ebp	multiplier

	movl	(%esi,%ecx,4), %eax
	mull	%ebp
	addl	%ebx, %eax
	adcl	$0, %edx
	addl	%eax, (%edi,%ecx,4)
	adcl	$0, %edx
	movl	%edx, %ebx
	addl	$1, %ecx
	jl	L(inner)


	movl	%ebx, (%edi)	C carry limb stored above this addmul
	movl	VAR_OUTER, %ecx
	incl	%ecx
	jnz	L(outer)


L(corner):
	C esi	&src[size]
	C edi	&dst[2*size-3]

	movl	-4(%esi), %eax
	mull	-8(%esi)	C src[size-1]*src[size-2]
	addl	%eax, 0(%edi)
	adcl	$0, %edx
	movl	%edx, 4(%edi)	C dst high limb


C -----------------------------------------------------------------------------
C Left shift of dst[1..2*size-2], high bit shifted out becomes dst[2*size-1].

	movl	PARAM_SIZE, %eax
	negl	%eax
	addl	$1, %eax	C -(size-1) and clear carry

L(lshift):
	C eax	counter, negative
	C ebx	next limb
	C ecx
	C edx
	C esi
	C edi	&dst[2*size-4]
	C ebp

	C rcl-by-1 doubles two limbs per iteration, the carry flag passing
	C the shifted-out bit from each limb up into the next.
	rcll	8(%edi,%eax,8)
	rcll	12(%edi,%eax,8)
	incl	%eax		C incl leaves the carry flag untouched
	jnz	L(lshift)


	adcl	%eax, %eax	C high bit out (eax is 0 at loop exit)
	movl	%eax, 8(%edi)	C dst most significant limb


C Now add in the squares on the diagonal, namely src[0]^2, src[1]^2, ...,
C src[size-1]^2.  dst[0] hasn't yet been set at all yet, and just gets the
C low limb of src[0]^2.

	movl	PARAM_SRC, %esi
	movl	(%esi), %eax	C src[0]
	mull	%eax		C src[0]^2

	movl	PARAM_SIZE, %ecx
	leal	(%esi,%ecx,4), %esi	C src end

	negl	%ecx		C -size
	movl	%edx, %ebx	C initial carry

	movl	%eax, 12(%edi,%ecx,8)	C dst[0]
	incl	%ecx		C -(size-1)

L(diag):
	C eax	scratch (low product)
	C ebx	carry limb
	C ecx	counter, -(size-1) to -1
	C edx	scratch (high product)
	C esi	&src[size]
	C edi	&dst[2*size-3]
	C ebp	scratch (fetched dst limbs)

	movl	(%esi,%ecx,4), %eax
	mull	%eax

	C Add the carried high product from the previous square, then this
	C square itself, into the two dst limbs for this diagonal position.
	addl	%ebx, 8(%edi,%ecx,8)
	movl	%edx, %ebx

	adcl	%eax, 12(%edi,%ecx,8)
	adcl	$0, %ebx

	incl	%ecx
	jnz	L(diag)


	addl	%ebx, 8(%edi)	C dst most significant limb

	popl	%ebp
	popl	%esi

	popl	%edi
	popl	%ebx

	ret

EPILOGUE()