1dnl Intel P5 mpn_sqr_basecase -- square an mpn number. 2 3dnl Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc. 4dnl 5dnl This file is part of the GNU MP Library. 6dnl 7dnl The GNU MP Library is free software; you can redistribute it and/or 8dnl modify it under the terms of the GNU Lesser General Public License as 9dnl published by the Free Software Foundation; either version 3 of the 10dnl License, or (at your option) any later version. 11dnl 12dnl The GNU MP Library is distributed in the hope that it will be useful, 13dnl but WITHOUT ANY WARRANTY; without even the implied warranty of 14dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15dnl Lesser General Public License for more details. 16dnl 17dnl You should have received a copy of the GNU Lesser General Public License 18dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 19 20include(`../config.m4') 21 22 23C P5: approx 8 cycles per crossproduct, or 15.5 cycles per triangular 24C product at around 20x20 limbs. 25 26 27C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size); 28C 29C Calculate src,size squared, storing the result in dst,2*size. 30C 31C The algorithm is basically the same as mpn/generic/sqr_basecase.c, but a 32C lot of function call overheads are avoided, especially when the size is 33C small. 34 35defframe(PARAM_SIZE,12) 36defframe(PARAM_SRC, 8) 37defframe(PARAM_DST, 4) 38 39 TEXT 40 ALIGN(8) 41PROLOGUE(mpn_sqr_basecase) 42deflit(`FRAME',0) 43 44 movl PARAM_SIZE, %edx 45 movl PARAM_SRC, %eax 46 47 cmpl $2, %edx 48 movl PARAM_DST, %ecx 49 50 je L(two_limbs) 51 52 movl (%eax), %eax 53 ja L(three_or_more) 54 55C ----------------------------------------------------------------------------- 56C one limb only 57 C eax src 58 C ebx 59 C ecx dst 60 C edx 61 62 mull %eax 63 64 movl %eax, (%ecx) 65 movl %edx, 4(%ecx) 66 67 ret 68 69C ----------------------------------------------------------------------------- 70 ALIGN(8) 71L(two_limbs): 72 C eax src 73 C ebx 74 C ecx dst 75 C edx size 76 77 pushl %ebp 78 pushl %edi 79 80 pushl %esi 81 pushl %ebx 82 83 movl %eax, %ebx 84 movl (%eax), %eax 85 86 mull %eax C src[0]^2 87 88 movl %eax, (%ecx) C dst[0] 89 movl %edx, %esi C dst[1] 90 91 movl 4(%ebx), %eax 92 93 mull %eax C src[1]^2 94 95 movl %eax, %edi C dst[2] 96 movl %edx, %ebp C dst[3] 97 98 movl (%ebx), %eax 99 100 mull 4(%ebx) C src[0]*src[1] 101 102 addl %eax, %esi 103 popl %ebx 104 105 adcl %edx, %edi 106 107 adcl $0, %ebp 108 addl %esi, %eax 109 110 adcl %edi, %edx 111 movl %eax, 4(%ecx) 112 113 adcl $0, %ebp 114 popl %esi 115 116 movl %edx, 8(%ecx) 117 movl %ebp, 12(%ecx) 118 119 popl %edi 120 popl %ebp 121 122 ret 123 124 125C ----------------------------------------------------------------------------- 126 ALIGN(8) 127L(three_or_more): 128 C eax src low limb 129 C ebx 130 C ecx dst 131 C edx size 132 133 cmpl $4, %edx 134 pushl %ebx 135deflit(`FRAME',4) 136 137 movl PARAM_SRC, %ebx 138 jae L(four_or_more) 139 140 141C ----------------------------------------------------------------------------- 142C three limbs 143 C eax src low limb 144 C ebx src 145 C ecx dst 146 C edx size 147 148 pushl %ebp 149 pushl %edi 150 151 mull %eax C src[0] ^ 2 152 153 movl %eax, (%ecx) 154 movl %edx, 4(%ecx) 155 156 movl 4(%ebx), %eax 157 xorl %ebp, %ebp 158 159 mull %eax C src[1] ^ 2 160 161 movl %eax, 8(%ecx) 162 movl %edx, 12(%ecx) 163 164 movl 8(%ebx), %eax 165 pushl %esi C risk of cache bank clash 166 167 mull %eax C src[2] ^ 2 168 169 movl %eax, 16(%ecx) 170 movl %edx, 20(%ecx) 171 172 movl (%ebx), %eax 173 174 mull 4(%ebx) C src[0] * src[1] 175 176 movl %eax, %esi 177 movl %edx, %edi 178 179 movl (%ebx), %eax 180 181 mull 8(%ebx) C src[0] * src[2] 182 183 addl %eax, %edi 184 movl %edx, %ebp 185 186 adcl $0, %ebp 187 movl 4(%ebx), %eax 188 189 mull 8(%ebx) C src[1] * src[2] 190 191 xorl %ebx, %ebx 192 addl %eax, %ebp 193 194 C eax 195 C ebx zero, will be dst[5] 196 C ecx dst 197 C edx dst[4] 198 C esi dst[1] 199 C edi dst[2] 200 C ebp dst[3] 201 202 adcl $0, %edx 203 addl %esi, %esi 204 205 adcl %edi, %edi 206 207 adcl %ebp, %ebp 208 209 adcl %edx, %edx 210 movl 4(%ecx), %eax 211 212 adcl $0, %ebx 213 addl %esi, %eax 214 215 movl %eax, 4(%ecx) 216 movl 8(%ecx), %eax 217 218 adcl %edi, %eax 219 movl 12(%ecx), %esi 220 221 adcl %ebp, %esi 222 movl 16(%ecx), %edi 223 224 movl %eax, 8(%ecx) 225 movl %esi, 12(%ecx) 226 227 adcl %edx, %edi 228 popl %esi 229 230 movl 20(%ecx), %eax 231 movl %edi, 16(%ecx) 232 233 popl %edi 234 popl %ebp 235 236 adcl %ebx, %eax C no carry out of this 237 popl %ebx 238 239 movl %eax, 20(%ecx) 240 241 ret 242 243 244C ----------------------------------------------------------------------------- 245 ALIGN(8) 246L(four_or_more): 247 C eax src low limb 248 C ebx src 249 C ecx dst 250 C edx size 251 C esi 252 C edi 253 C ebp 254 C 255 C First multiply src[0]*src[1..size-1] and store at dst[1..size]. 256 257deflit(`FRAME',4) 258 259 pushl %edi 260FRAME_pushl() 261 pushl %esi 262FRAME_pushl() 263 264 pushl %ebp 265FRAME_pushl() 266 leal (%ecx,%edx,4), %edi C dst end of this mul1 267 268 leal (%ebx,%edx,4), %esi C src end 269 movl %ebx, %ebp C src 270 271 negl %edx C -size 272 xorl %ebx, %ebx C clear carry limb and carry flag 273 274 leal 1(%edx), %ecx C -(size-1) 275 276L(mul1): 277 C eax scratch 278 C ebx carry 279 C ecx counter, negative 280 C edx scratch 281 C esi &src[size] 282 C edi &dst[size] 283 C ebp src 284 285 adcl $0, %ebx 286 movl (%esi,%ecx,4), %eax 287 288 mull (%ebp) 289 290 addl %eax, %ebx 291 292 movl %ebx, (%edi,%ecx,4) 293 incl %ecx 294 295 movl %edx, %ebx 296 jnz L(mul1) 297 298 299 C Add products src[n]*src[n+1..size-1] at dst[2*n-1...], for 300 C n=1..size-2. 301 C 302 C The last two products, which are the end corner of the product 303 C triangle, are handled separately to save looping overhead. These 304 C are src[size-3]*src[size-2,size-1] and src[size-2]*src[size-1]. 305 C If size is 4 then it's only these that need to be done. 306 C 307 C In the outer loop %esi is a constant, and %edi just advances by 1 308 C limb each time. The size of the operation decreases by 1 limb 309 C each time. 310 311 C eax 312 C ebx carry (needing carry flag added) 313 C ecx 314 C edx 315 C esi &src[size] 316 C edi &dst[size] 317 C ebp 318 319 adcl $0, %ebx 320 movl PARAM_SIZE, %edx 321 322 movl %ebx, (%edi) 323 subl $4, %edx 324 325 negl %edx 326 jz L(corner) 327 328 329L(outer): 330 C ebx previous carry limb to store 331 C edx outer loop counter (negative) 332 C esi &src[size] 333 C edi dst, pointing at stored carry limb of previous loop 334 335 pushl %edx C new outer loop counter 336 leal -2(%edx), %ecx 337 338 movl %ebx, (%edi) 339 addl $4, %edi 340 341 addl $4, %ebp 342 xorl %ebx, %ebx C initial carry limb, clear carry flag 343 344L(inner): 345 C eax scratch 346 C ebx carry (needing carry flag added) 347 C ecx counter, negative 348 C edx scratch 349 C esi &src[size] 350 C edi dst end of this addmul 351 C ebp &src[j] 352 353 adcl $0, %ebx 354 movl (%esi,%ecx,4), %eax 355 356 mull (%ebp) 357 358 addl %ebx, %eax 359 movl (%edi,%ecx,4), %ebx 360 361 adcl $0, %edx 362 addl %eax, %ebx 363 364 movl %ebx, (%edi,%ecx,4) 365 incl %ecx 366 367 movl %edx, %ebx 368 jnz L(inner) 369 370 371 adcl $0, %ebx 372 popl %edx C outer loop counter 373 374 incl %edx 375 jnz L(outer) 376 377 378 movl %ebx, (%edi) 379 380L(corner): 381 C esi &src[size] 382 C edi &dst[2*size-4] 383 384 movl -8(%esi), %eax 385 movl -4(%edi), %ebx C risk of data cache bank clash here 386 387 mull -12(%esi) C src[size-2]*src[size-3] 388 389 addl %eax, %ebx 390 movl %edx, %ecx 391 392 adcl $0, %ecx 393 movl -4(%esi), %eax 394 395 mull -12(%esi) C src[size-1]*src[size-3] 396 397 addl %ecx, %eax 398 movl (%edi), %ecx 399 400 adcl $0, %edx 401 movl %ebx, -4(%edi) 402 403 addl %eax, %ecx 404 movl %edx, %ebx 405 406 adcl $0, %ebx 407 movl -4(%esi), %eax 408 409 mull -8(%esi) C src[size-1]*src[size-2] 410 411 movl %ecx, (%edi) 412 addl %eax, %ebx 413 414 adcl $0, %edx 415 movl PARAM_SIZE, %eax 416 417 negl %eax 418 movl %ebx, 4(%edi) 419 420 addl $1, %eax C -(size-1) and clear carry 421 movl %edx, 8(%edi) 422 423 424C ----------------------------------------------------------------------------- 425C Left shift of dst[1..2*size-2], high bit shifted out becomes dst[2*size-1]. 426 427L(lshift): 428 C eax counter, negative 429 C ebx next limb 430 C ecx 431 C edx 432 C esi 433 C edi &dst[2*size-4] 434 C ebp 435 436 movl 12(%edi,%eax,8), %ebx 437 438 rcll %ebx 439 movl 16(%edi,%eax,8), %ecx 440 441 rcll %ecx 442 movl %ebx, 12(%edi,%eax,8) 443 444 movl %ecx, 16(%edi,%eax,8) 445 incl %eax 446 447 jnz L(lshift) 448 449 450 adcl %eax, %eax C high bit out 451 movl PARAM_SRC, %esi 452 453 movl PARAM_SIZE, %ecx C risk of cache bank clash 454 movl %eax, 12(%edi) C dst most significant limb 455 456 457C ----------------------------------------------------------------------------- 458C Now add in the squares on the diagonal, namely src[0]^2, src[1]^2, ..., 459C src[size-1]^2. dst[0] hasn't yet been set at all yet, and just gets the 460C low limb of src[0]^2. 461 462 movl (%esi), %eax C src[0] 463 leal (%esi,%ecx,4), %esi C src end 464 465 negl %ecx 466 467 mull %eax 468 469 movl %eax, 16(%edi,%ecx,8) C dst[0] 470 movl %edx, %ebx 471 472 addl $1, %ecx C size-1 and clear carry 473 474L(diag): 475 C eax scratch (low product) 476 C ebx carry limb 477 C ecx counter, negative 478 C edx scratch (high product) 479 C esi &src[size] 480 C edi &dst[2*size-4] 481 C ebp scratch (fetched dst limbs) 482 483 movl (%esi,%ecx,4), %eax 484 adcl $0, %ebx 485 486 mull %eax 487 488 movl 16-4(%edi,%ecx,8), %ebp 489 490 addl %ebp, %ebx 491 movl 16(%edi,%ecx,8), %ebp 492 493 adcl %eax, %ebp 494 movl %ebx, 16-4(%edi,%ecx,8) 495 496 movl %ebp, 16(%edi,%ecx,8) 497 incl %ecx 498 499 movl %edx, %ebx 500 jnz L(diag) 501 502 503 adcl $0, %edx 504 movl 16-4(%edi), %eax C dst most significant limb 505 506 addl %eax, %edx 507 popl %ebp 508 509 movl %edx, 16-4(%edi) 510 popl %esi C risk of cache bank clash 511 512 popl %edi 513 popl %ebx 514 515 ret 516 517EPILOGUE() 518