1dnl x86 generic mpn_sqr_basecase -- square an mpn number. 2 3dnl Copyright 1999, 2000, 2002, 2003 Free Software Foundation, Inc. 4dnl 5dnl This file is part of the GNU MP Library. 6dnl 7dnl The GNU MP Library is free software; you can redistribute it and/or modify 8dnl it under the terms of the GNU Lesser General Public License as published 9dnl by the Free Software Foundation; either version 3 of the License, or (at 10dnl your option) any later version. 11dnl 12dnl The GNU MP Library is distributed in the hope that it will be useful, but 13dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 14dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public 15dnl License for more details. 16dnl 17dnl You should have received a copy of the GNU Lesser General Public License 18dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 19 20 21include(`../config.m4') 22 23 24C cycles/crossproduct cycles/triangleproduct 25C P5 26C P6 27C K6 28C K7 29C P4 30 31 32C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size); 33C 34C The algorithm is basically the same as mpn/generic/sqr_basecase.c, but a 35C lot of function call overheads are avoided, especially when the size is 36C small. 37C 38C The mul1 loop is not unrolled like mul_1.asm, it doesn't seem worth the 39C code size to do so here. 40C 41C Enhancements: 42C 43C The addmul loop here is also not unrolled like aorsmul_1.asm and 44C mul_basecase.asm are. Perhaps it should be done. It'd add to the 45C complexity, but if it's worth doing in the other places then it should be 46C worthwhile here. 47C 48C A fully-unrolled style like other sqr_basecase.asm versions (k6, k7, p6) 49C might be worth considering. That'd add quite a bit to the code size, but 50C only as much as is used would be dragged into L1 cache. 51 52defframe(PARAM_SIZE,12) 53defframe(PARAM_SRC, 8) 54defframe(PARAM_DST, 4) 55 56 TEXT 57 ALIGN(8) 58PROLOGUE(mpn_sqr_basecase) 59deflit(`FRAME',0) 60 61 movl PARAM_SIZE, %edx 62 63 movl PARAM_SRC, %eax 64 65 cmpl $2, %edx 66 movl PARAM_DST, %ecx 67 68 je L(two_limbs) 69 ja L(three_or_more) 70 71 72C ----------------------------------------------------------------------------- 73C one limb only 74 C eax src 75 C ebx 76 C ecx dst 77 C edx 78 79 movl (%eax), %eax 80 mull %eax 81 movl %eax, (%ecx) 82 movl %edx, 4(%ecx) 83 ret 84 85 86C ----------------------------------------------------------------------------- 87 ALIGN(8) 88L(two_limbs): 89 C eax src 90 C ebx 91 C ecx dst 92 C edx 93 94 pushl %ebx 95 pushl %ebp 96 97 movl %eax, %ebx 98 movl (%eax), %eax 99 100 mull %eax C src[0]^2 101 102 pushl %esi 103 pushl %edi 104 105 movl %edx, %esi C dst[1] 106 movl %eax, (%ecx) C dst[0] 107 108 movl 4(%ebx), %eax 109 mull %eax C src[1]^2 110 111 movl %eax, %edi C dst[2] 112 movl %edx, %ebp C dst[3] 113 114 movl (%ebx), %eax 115 mull 4(%ebx) C src[0]*src[1] 116 117 addl %eax, %esi 118 119 adcl %edx, %edi 120 121 adcl $0, %ebp 122 addl %esi, %eax 123 124 adcl %edi, %edx 125 movl %eax, 4(%ecx) 126 127 adcl $0, %ebp 128 129 movl %edx, 8(%ecx) 130 movl %ebp, 12(%ecx) 131 132 popl %edi 133 popl %esi 134 135 popl %ebp 136 popl %ebx 137 138 ret 139 140 141C ----------------------------------------------------------------------------- 142 ALIGN(8) 143L(three_or_more): 144deflit(`FRAME',0) 145 C eax src 146 C ebx 147 C ecx dst 148 C edx size 149 150 pushl %ebx FRAME_pushl() 151 pushl %edi FRAME_pushl() 152 153 pushl %esi FRAME_pushl() 154 pushl %ebp FRAME_pushl() 155 156 leal (%ecx,%edx,4), %edi C &dst[size], end of this mul1 157 leal (%eax,%edx,4), %esi C &src[size] 158 159C First multiply src[0]*src[1..size-1] and store at dst[1..size]. 160 161 movl (%eax), %ebp C src[0], multiplier 162 movl %edx, %ecx 163 164 negl %ecx C -size 165 xorl %ebx, %ebx C clear carry limb 166 167 incl %ecx C -(size-1) 168 169L(mul1): 170 C eax scratch 171 C ebx carry 172 C ecx counter, limbs, negative 173 C edx scratch 174 C esi &src[size] 175 C edi &dst[size] 176 C ebp multiplier 177 178 movl (%esi,%ecx,4), %eax 179 mull %ebp 180 addl %eax, %ebx 181 adcl $0, %edx 182 movl %ebx, (%edi,%ecx,4) 183 movl %edx, %ebx 184 incl %ecx 185 jnz L(mul1) 186 187 movl %ebx, (%edi) 188 189 190 C Add products src[n]*src[n+1..size-1] at dst[2*n-1...], for 191 C n=1..size-2. 192 C 193 C The last products src[size-2]*src[size-1], which is the end corner 194 C of the product triangle, is handled separately at the end to save 195 C looping overhead. If size is 3 then it's only this that needs to 196 C be done. 197 C 198 C In the outer loop %esi is a constant, and %edi just advances by 1 199 C limb each time. The size of the operation decreases by 1 limb 200 C each time. 201 202 C eax 203 C ebx carry (needing carry flag added) 204 C ecx 205 C edx 206 C esi &src[size] 207 C edi &dst[size] 208 C ebp 209 210 movl PARAM_SIZE, %ecx 211 subl $3, %ecx 212 jz L(corner) 213 214 negl %ecx 215 216dnl re-use parameter space 217define(VAR_OUTER,`PARAM_DST') 218 219L(outer): 220 C eax 221 C ebx 222 C ecx 223 C edx outer loop counter, -(size-3) to -1 224 C esi &src[size] 225 C edi dst, pointing at stored carry limb of previous loop 226 C ebp 227 228 movl %ecx, VAR_OUTER 229 addl $4, %edi C advance dst end 230 231 movl -8(%esi,%ecx,4), %ebp C next multiplier 232 subl $1, %ecx 233 234 xorl %ebx, %ebx C initial carry limb 235 236L(inner): 237 C eax scratch 238 C ebx carry (needing carry flag added) 239 C ecx counter, -n-1 to -1 240 C edx scratch 241 C esi &src[size] 242 C edi dst end of this addmul 243 C ebp multiplier 244 245 movl (%esi,%ecx,4), %eax 246 mull %ebp 247 addl %ebx, %eax 248 adcl $0, %edx 249 addl %eax, (%edi,%ecx,4) 250 adcl $0, %edx 251 movl %edx, %ebx 252 addl $1, %ecx 253 jl L(inner) 254 255 256 movl %ebx, (%edi) 257 movl VAR_OUTER, %ecx 258 incl %ecx 259 jnz L(outer) 260 261 262L(corner): 263 C esi &src[size] 264 C edi &dst[2*size-3] 265 266 movl -4(%esi), %eax 267 mull -8(%esi) C src[size-1]*src[size-2] 268 addl %eax, 0(%edi) 269 adcl $0, %edx 270 movl %edx, 4(%edi) C dst high limb 271 272 273C ----------------------------------------------------------------------------- 274C Left shift of dst[1..2*size-2], high bit shifted out becomes dst[2*size-1]. 275 276 movl PARAM_SIZE, %eax 277 negl %eax 278 addl $1, %eax C -(size-1) and clear carry 279 280L(lshift): 281 C eax counter, negative 282 C ebx next limb 283 C ecx 284 C edx 285 C esi 286 C edi &dst[2*size-4] 287 C ebp 288 289 rcll 8(%edi,%eax,8) 290 rcll 12(%edi,%eax,8) 291 incl %eax 292 jnz L(lshift) 293 294 295 adcl %eax, %eax C high bit out 296 movl %eax, 8(%edi) C dst most significant limb 297 298 299C Now add in the squares on the diagonal, namely src[0]^2, src[1]^2, ..., 300C src[size-1]^2. dst[0] hasn't yet been set at all yet, and just gets the 301C low limb of src[0]^2. 302 303 movl PARAM_SRC, %esi 304 movl (%esi), %eax C src[0] 305 mull %eax C src[0]^2 306 307 movl PARAM_SIZE, %ecx 308 leal (%esi,%ecx,4), %esi C src end 309 310 negl %ecx C -size 311 movl %edx, %ebx C initial carry 312 313 movl %eax, 12(%edi,%ecx,8) C dst[0] 314 incl %ecx C -(size-1) 315 316L(diag): 317 C eax scratch (low product) 318 C ebx carry limb 319 C ecx counter, -(size-1) to -1 320 C edx scratch (high product) 321 C esi &src[size] 322 C edi &dst[2*size-3] 323 C ebp scratch (fetched dst limbs) 324 325 movl (%esi,%ecx,4), %eax 326 mull %eax 327 328 addl %ebx, 8(%edi,%ecx,8) 329 movl %edx, %ebx 330 331 adcl %eax, 12(%edi,%ecx,8) 332 adcl $0, %ebx 333 334 incl %ecx 335 jnz L(diag) 336 337 338 addl %ebx, 8(%edi) C dst most significant limb 339 340 popl %ebp 341 popl %esi 342 343 popl %edi 344 popl %ebx 345 346 ret 347 348EPILOGUE() 349