dnl  SPARC v9 64-bit mpn_addmul_1 -- Multiply a limb vector with a limb and add
dnl  the result to a second limb vector.

dnl  Copyright 1998, 2000, 2001, 2002, 2003, 2004 Free Software Foundation,
dnl  Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C		   cycles/limb
C UltraSPARC 1&2:     14
C UltraSPARC 3:	      17.5

C  Algorithm: We use eight floating-point multiplies per limb product, with the
C  invariant v operand split into four 16-bit pieces and the up operand split
C  into 32-bit pieces.  We sum pairs of 48-bit partial products using
C  floating-point add, then convert the four 49-bit product-sums and transfer
C  them to the integer unit.

C  Possible optimizations:
C   0. Rewrite to use the algorithm of mpn_addmul_2.
C   1. Align the stack area where we transfer the four 49-bit product-sums
C      to a 32-byte boundary.  That would minimize the cache collisions.
C      (UltraSPARC-1/2 use a direct-mapped cache.)  (Perhaps even better would
C      be to align the area so that it maps to the area immediately before up?)
C   2. Sum the four 49-bit quantities using 32-bit operations, as in the
C      development version of mpn_addmul_2.  This would save many integer
C      instructions.
C   3. Unrolling.  Questionable if it is worth the code expansion, given that
C      it could only save 1 cycle/limb.
C   4. Specialize for particular v values.  If its upper 32 bits are zero, we
C      could save many operations, in the FPU (fmuld), but more so in the IEU
C      since we'll be summing 48-bit quantities, which might be simpler.
C   5. Ideally, we should schedule the f2/f3 and f4/f5 RAW dependencies further
C      apart, and the i00,i16,i32,i48 RAW dependencies closer together.  The
C      latter distance should not be greater than needed for L2 cache latency,
C      and also not so great that i16 needs to be copied.
C   6. Avoid performing mem+fa+fm in the same cycle, at least not when we want
C      to get high IEU bandwidth.  (12 of the 14 cycles will be free for 2 IEU
C      ops.)

C  Instruction classification (as per UltraSPARC-1/2 functional units):
C     8 FM
C    10 FA
C    12 MEM
C    10 ISHIFT + 14 IADDLOG
C     1 BRANCH
C    55 insns in total (plus one mov insn that should be optimized out)

C  The loop executes 56 instructions in 14 cycles on UltraSPARC-1/2, i.e. we
C  sustain the peak execution rate of 4 instructions/cycle.
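
C  For illustration only, here is the per-limb arithmetic modelled in plain C.
C  This sketch is not part of the build; the function name and the use of
C  GCC's unsigned __int128 are ad-hoc choices for the model, not GMP
C  interfaces.
C
C    #include <stdint.h>
C    uint64_t model_addmul_1 (uint64_t *rp, const uint64_t *up,
C                             long n, uint64_t v)
C    {
C      uint64_t v00 = v & 0xffff,         v16 = (v >> 16) & 0xffff;
C      uint64_t v32 = (v >> 32) & 0xffff, v48 = v >> 48;
C      uint64_t cy = 0;
C      for (long i = 0; i < n; i++)
C        {
C          uint64_t u00 = up[i] & 0xffffffff, u32 = up[i] >> 32;
C          /* Eight 32x16-bit products; each fits in 48 bits, so each
C             pairwise sum below fits in 49 bits.  */
C          uint64_t s00 = u00 * v00;              /* weight 2^0  */
C          uint64_t s16 = u00 * v16;              /* weight 2^16 */
C          uint64_t s32 = u00 * v32 + u32 * v00;  /* weight 2^32 */
C          uint64_t s48 = u00 * v48 + u32 * v16;  /* weight 2^48 */
C          uint64_t s64 = u32 * v32;              /* weight 2^64 */
C          uint64_t s80 = u32 * v48;              /* weight 2^80 */
C          unsigned __int128 acc = (unsigned __int128) rp[i] + cy
C            + s00
C            + ((unsigned __int128) s16 << 16)
C            + ((unsigned __int128) s32 << 32)
C            + ((unsigned __int128) s48 << 48)
C            + ((unsigned __int128) s64 << 64)
C            + ((unsigned __int128) s80 << 80);
C          rp[i] = (uint64_t) acc;
C          cy = (uint64_t) (acc >> 64);
C        }
C      return cy;
C    }
C
C  The code below computes the products and pairwise sums in the FPU
C  (fmuld/faddd/fdtox), folds the s64 and s80 terms of limb i into the
C  weight-2^0 and weight-2^16 sums of limb i+1, and does the recombination
C  with integer shifts and adds.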

C INPUT PARAMETERS
C rp	i0
C up	i1
C n	i2
C v	i3

ASM_START()
	REGISTER(%g2,#scratch)
	REGISTER(%g3,#scratch)

define(`p00',`%f8')  define(`p16',`%f10') define(`p32',`%f12') define(`p48',`%f14')
define(`r32',`%f16') define(`r48',`%f18') define(`r64',`%f20') define(`r80',`%f22')
define(`v00',`%f24') define(`v16',`%f26') define(`v32',`%f28') define(`v48',`%f30')
define(`u00',`%f32') define(`u32',`%f34')
define(`a00',`%f36') define(`a16',`%f38') define(`a32',`%f40') define(`a48',`%f42')
define(`cy',`%g1')
define(`rlimb',`%g3')
define(`i00',`%l0')  define(`i16',`%l1')  define(`i32',`%l2')  define(`i48',`%l3')
define(`xffffffff',`%l7')
define(`xffff',`%o0')

PROLOGUE(mpn_addmul_1)

C Initialization.  (1) Split the v operand into four 16-bit chunks and store
C them as IEEE doubles in fp registers.  (2) Clear the upper 32 bits of the fp
C register pairs f2 and f4.  (3) Store masks in the registers aliased to
C `xffff' and `xffffffff'.

	save	%sp, -256, %sp
	mov	-1, %g4
	srlx	%g4, 48, xffff		C store mask in register `xffff'
	and	%i3, xffff, %g2
	stx	%g2, [%sp+2223+0]
	srlx	%i3, 16, %g3
	and	%g3, xffff, %g3
	stx	%g3, [%sp+2223+8]
	srlx	%i3, 32, %g2
	and	%g2, xffff, %g2
	stx	%g2, [%sp+2223+16]
	srlx	%i3, 48, %g3
	stx	%g3, [%sp+2223+24]
	srlx	%g4, 32, xffffffff	C store mask in register `xffffffff'

	sllx	%i2, 3, %i2
	mov	0, cy			C clear cy
	add	%i0, %i2, %i0
	add	%i1, %i2, %i1
	neg	%i2
	add	%i1, 4, %i5
	add	%i0, -32, %i4
	add	%i0, -16, %i0

	ldd	[%sp+2223+0], v00
	ldd	[%sp+2223+8], v16
	ldd	[%sp+2223+16], v32
	ldd	[%sp+2223+24], v48
	ld	[%sp+2223+0], %f2	C zero f2
	ld	[%sp+2223+0], %f4	C zero f4
	ld	[%i5+%i2], %f3		C read low 32 bits of up[i]
	ld	[%i1+%i2], %f5		C read high 32 bits of up[i]
	fxtod	v00, v00
	fxtod	v16, v16
	fxtod	v32, v32
	fxtod	v48, v48

C Start the real work.  (We sneakily read f3 and f5 above...)
C The software pipeline is very deep, requiring 4 feed-in stages.
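
C For reference (an illustration read off the per-instruction comments, not
C assembled code): with rlimb = rp[i], cy the incoming carry, and
C i00,i16,i32,i48 the 49-bit column sums read back from the scratch area,
C each recombination block below in effect computes
C
C	x     = i00 + LO(rlimb) + cy
C	hi64  = (i16 >> 48) + ((i32 + HI(rlimb)) >> 32) + (i48 >> 16)
C	mi64  = i16 + ((i32 + HI(rlimb)) << 16) + (i48 << 32)
C	        - (hi64 << 48) + (x >> 16)
C	rp[i] = (mi64 << 16) | (x & 0xffff)
C	cy    = hi64 + (mi64 >> 48)
C
C where LO() and HI() denote the low and high 32-bit halves.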

	fxtod	%f2, u00
	fxtod	%f4, u32
	fmuld	u00, v00, a00
	fmuld	u00, v16, a16
	fmuld	u00, v32, p32
	fmuld	u32, v00, r32
	fmuld	u00, v48, p48
	addcc	%i2, 8, %i2
	bnz,pt	%xcc, .L_two_or_more
	fmuld	u32, v16, r48

.L_one:
	fmuld	u32, v32, r64	C FIXME not urgent
	faddd	p32, r32, a32
	fdtox	a00, a00
	faddd	p48, r48, a48
	fmuld	u32, v48, r80	C FIXME not urgent
	fdtox	a16, a16
	fdtox	a32, a32
	fdtox	a48, a48
	std	a00, [%sp+2223+0]
	std	a16, [%sp+2223+8]
	std	a32, [%sp+2223+16]
	std	a48, [%sp+2223+24]
	add	%i2, 8, %i2

	fdtox	r64, a00
	ldx	[%i0+%i2], rlimb	C read rp[i]
	fdtox	r80, a16
	ldx	[%sp+2223+0], i00
	ldx	[%sp+2223+8], i16
	ldx	[%sp+2223+16], i32
	ldx	[%sp+2223+24], i48
	std	a00, [%sp+2223+0]
	std	a16, [%sp+2223+8]
	add	%i2, 8, %i2

	srlx	rlimb, 32, %g4		C HI(rlimb)
	and	rlimb, xffffffff, %g5	C LO(rlimb)
	add	i00, %g5, %g5		C i00+ now in g5
	ldx	[%sp+2223+0], i00
	srlx	i16, 48, %l4		C (i16 >> 48)
	mov	i16, %g2
	ldx	[%sp+2223+8], i16
	srlx	i48, 16, %l5		C (i48 >> 16)
	add	i32, %g4, %g4		C i32+ now in g4
	sllx	i48, 32, %l6		C (i48 << 32)
	srlx	%g4, 32, %o3		C (i32 >> 32)
	add	%l5, %l4, %o1		C hi64- in %o1
	std	a00, [%sp+2223+0]
	sllx	%g4, 16, %o2		C (i32 << 16)
	add	%o3, %o1, %o1		C hi64 in %o1   1st ASSIGNMENT
	std	a16, [%sp+2223+8]
	sllx	%o1, 48, %o3		C (hi64 << 48)
	add	%g2, %o2, %o2		C mi64- in %o2
	add	%l6, %o2, %o2		C mi64- in %o2
	sub	%o2, %o3, %o2		C mi64 in %o2   1st ASSIGNMENT
	add	cy, %g5, %o4		C x = prev(i00) + cy
	b	.L_out_1
	add	%i2, 8, %i2

.L_two_or_more:
	ld	[%i5+%i2], %f3		C read low 32 bits of up[i]
	fmuld	u32, v32, r64	C FIXME not urgent
	faddd	p32, r32, a32
	ld	[%i1+%i2], %f5		C read high 32 bits of up[i]
	fdtox	a00, a00
	faddd	p48, r48, a48
	fmuld	u32, v48, r80	C FIXME not urgent
	fdtox	a16, a16
	fdtox	a32, a32
	fxtod	%f2, u00
	fxtod	%f4, u32
	fdtox	a48, a48
	std	a00, [%sp+2223+0]
	fmuld	u00, v00, p00
	std	a16, [%sp+2223+8]
	fmuld	u00, v16, p16
	std	a32, [%sp+2223+16]
	fmuld	u00, v32, p32
	std	a48, [%sp+2223+24]
	faddd	p00, r64, a00
	fmuld	u32, v00, r32
	faddd	p16, r80, a16
	fmuld	u00, v48, p48
	addcc	%i2, 8, %i2
	bnz,pt	%xcc, .L_three_or_more
	fmuld	u32, v16, r48

.L_two:
	fmuld	u32, v32, r64	C FIXME not urgent
	faddd	p32, r32, a32
	fdtox	a00, a00
	ldx	[%i0+%i2], rlimb	C read rp[i]
	faddd	p48, r48, a48
	fmuld	u32, v48, r80	C FIXME not urgent
	fdtox	a16, a16
	ldx	[%sp+2223+0], i00
	fdtox	a32, a32
	ldx	[%sp+2223+8], i16
	ldx	[%sp+2223+16], i32
	ldx	[%sp+2223+24], i48
	fdtox	a48, a48
	std	a00, [%sp+2223+0]
	std	a16, [%sp+2223+8]
	std	a32, [%sp+2223+16]
	std	a48, [%sp+2223+24]
	add	%i2, 8, %i2

	fdtox	r64, a00
	srlx	rlimb, 32, %g4		C HI(rlimb)
	and	rlimb, xffffffff, %g5	C LO(rlimb)
	ldx	[%i0+%i2], rlimb	C read rp[i]
	add	i00, %g5, %g5		C i00+ now in g5
	fdtox	r80, a16
	ldx	[%sp+2223+0], i00
	srlx	i16, 48, %l4		C (i16 >> 48)
	mov	i16, %g2
	ldx	[%sp+2223+8], i16
	srlx	i48, 16, %l5		C (i48 >> 16)
	add	i32, %g4, %g4		C i32+ now in g4
	ldx	[%sp+2223+16], i32
	sllx	i48, 32, %l6		C (i48 << 32)
	ldx	[%sp+2223+24], i48
	srlx	%g4, 32, %o3		C (i32 >> 32)
	add	%l5, %l4, %o1		C hi64- in %o1
	std	a00, [%sp+2223+0]
	sllx	%g4, 16, %o2		C (i32 << 16)
	add	%o3, %o1, %o1		C hi64 in %o1   1st ASSIGNMENT
	std	a16, [%sp+2223+8]
	sllx	%o1, 48, %o3		C (hi64 << 48)
	add	%g2, %o2, %o2		C mi64- in %o2
	add	%l6, %o2, %o2		C mi64- in %o2
	sub	%o2, %o3, %o2		C mi64 in %o2   1st ASSIGNMENT
	add	cy, %g5, %o4		C x = prev(i00) + cy
	b	.L_out_2
	add	%i2, 8, %i2

.L_three_or_more:
	ld	[%i5+%i2], %f3		C read low 32 bits of up[i]
	fmuld	u32, v32, r64	C FIXME not urgent
	faddd	p32, r32, a32
	ld	[%i1+%i2], %f5		C read high 32 bits of up[i]
	fdtox	a00, a00
	ldx	[%i0+%i2], rlimb	C read rp[i]
	faddd	p48, r48, a48
	fmuld	u32, v48, r80	C FIXME not urgent
	fdtox	a16, a16
	ldx	[%sp+2223+0], i00
	fdtox	a32, a32
	ldx	[%sp+2223+8], i16
	fxtod	%f2, u00
	ldx	[%sp+2223+16], i32
	fxtod	%f4, u32
	ldx	[%sp+2223+24], i48
	fdtox	a48, a48
	std	a00, [%sp+2223+0]
	fmuld	u00, v00, p00
	std	a16, [%sp+2223+8]
	fmuld	u00, v16, p16
	std	a32, [%sp+2223+16]
	fmuld	u00, v32, p32
	std	a48, [%sp+2223+24]
	faddd	p00, r64, a00
	fmuld	u32, v00, r32
	faddd	p16, r80, a16
	fmuld	u00, v48, p48
	addcc	%i2, 8, %i2
	bnz,pt	%xcc, .L_four_or_more
	fmuld	u32, v16, r48

.L_three:
	fmuld	u32, v32, r64	C FIXME not urgent
	faddd	p32, r32, a32
	fdtox	a00, a00
	srlx	rlimb, 32, %g4		C HI(rlimb)
	and	rlimb, xffffffff, %g5	C LO(rlimb)
	ldx	[%i0+%i2], rlimb	C read rp[i]
	faddd	p48, r48, a48
	add	i00, %g5, %g5		C i00+ now in g5
	fmuld	u32, v48, r80	C FIXME not urgent
	fdtox	a16, a16
	ldx	[%sp+2223+0], i00
	fdtox	a32, a32
	srlx	i16, 48, %l4		C (i16 >> 48)
	mov	i16, %g2
	ldx	[%sp+2223+8], i16
	srlx	i48, 16, %l5		C (i48 >> 16)
	add	i32, %g4, %g4		C i32+ now in g4
	ldx	[%sp+2223+16], i32
	sllx	i48, 32, %l6		C (i48 << 32)
	ldx	[%sp+2223+24], i48
	fdtox	a48, a48
	srlx	%g4, 32, %o3		C (i32 >> 32)
	add	%l5, %l4, %o1		C hi64- in %o1
	std	a00, [%sp+2223+0]
	sllx	%g4, 16, %o2		C (i32 << 16)
	add	%o3, %o1, %o1		C hi64 in %o1   1st ASSIGNMENT
	std	a16, [%sp+2223+8]
	sllx	%o1, 48, %o3		C (hi64 << 48)
	add	%g2, %o2, %o2		C mi64- in %o2
	std	a32, [%sp+2223+16]
	add	%l6, %o2, %o2		C mi64- in %o2
	std	a48, [%sp+2223+24]
	sub	%o2, %o3, %o2		C mi64 in %o2   1st ASSIGNMENT
	add	cy, %g5, %o4		C x = prev(i00) + cy
	b	.L_out_3
	add	%i2, 8, %i2

.L_four_or_more:
	ld	[%i5+%i2], %f3		C read low 32 bits of up[i]
	fmuld	u32, v32, r64	C FIXME not urgent
	faddd	p32, r32, a32
	ld	[%i1+%i2], %f5		C read high 32 bits of up[i]
	fdtox	a00, a00
	srlx	rlimb, 32, %g4		C HI(rlimb)
	and	rlimb, xffffffff, %g5	C LO(rlimb)
	ldx	[%i0+%i2], rlimb	C read rp[i]
	faddd	p48, r48, a48
	add	i00, %g5, %g5		C i00+ now in g5
	fmuld	u32, v48, r80	C FIXME not urgent
	fdtox	a16, a16
	ldx	[%sp+2223+0], i00
	fdtox	a32, a32
	srlx	i16, 48, %l4		C (i16 >> 48)
	mov	i16, %g2
	ldx	[%sp+2223+8], i16
	fxtod	%f2, u00
	srlx	i48, 16, %l5		C (i48 >> 16)
	add	i32, %g4, %g4		C i32+ now in g4
	ldx	[%sp+2223+16], i32
	fxtod	%f4, u32
	sllx	i48, 32, %l6		C (i48 << 32)
	ldx	[%sp+2223+24], i48
	fdtox	a48, a48
	srlx	%g4, 32, %o3		C (i32 >> 32)
	add	%l5, %l4, %o1		C hi64- in %o1
	std	a00, [%sp+2223+0]
	fmuld	u00, v00, p00
	sllx	%g4, 16, %o2		C (i32 << 16)
	add	%o3, %o1, %o1		C hi64 in %o1   1st ASSIGNMENT
	std	a16, [%sp+2223+8]
	fmuld	u00, v16, p16
	sllx	%o1, 48, %o3		C (hi64 << 48)
	add	%g2, %o2, %o2		C mi64- in %o2
	std	a32, [%sp+2223+16]
	fmuld	u00, v32, p32
	add	%l6, %o2, %o2		C mi64- in %o2
	std	a48, [%sp+2223+24]
	faddd	p00, r64, a00
	fmuld	u32, v00, r32
	sub	%o2, %o3, %o2		C mi64 in %o2   1st ASSIGNMENT
	faddd	p16, r80, a16
	fmuld	u00, v48, p48
	add	cy, %g5, %o4		C x = prev(i00) + cy
	addcc	%i2, 8, %i2
	bnz,pt	%xcc, .Loop
	fmuld	u32, v16, r48

.L_four:
	b,a	.L_out_4

C BEGIN MAIN LOOP
	.align	16
.Loop:
C 00
	srlx	%o4, 16, %o5		C (x >> 16)
	ld	[%i5+%i2], %f3		C read low 32 bits of up[i]
	fmuld	u32, v32, r64	C FIXME not urgent
	faddd	p32, r32, a32
C 01
	add	%o5, %o2, %o2		C mi64 in %o2   2nd ASSIGNMENT
	and	%o4, xffff, %o5		C (x & 0xffff)
	ld	[%i1+%i2], %f5		C read high 32 bits of up[i]
	fdtox	a00, a00
C 02
	srlx	rlimb, 32, %g4		C HI(rlimb)
	and	rlimb, xffffffff, %g5	C LO(rlimb)
	ldx	[%i0+%i2], rlimb	C read rp[i]
	faddd	p48, r48, a48
C 03
	srlx	%o2, 48, %o7		C (mi64 >> 48)
	add	i00, %g5, %g5		C i00+ now in g5
	fmuld	u32, v48, r80	C FIXME not urgent
	fdtox	a16, a16
C 04
	sllx	%o2, 16, %i3		C (mi64 << 16)
	add	%o7, %o1, cy		C new cy
	ldx	[%sp+2223+0], i00
	fdtox	a32, a32
C 05
	srlx	i16, 48, %l4		C (i16 >> 48)
	mov	i16, %g2
	ldx	[%sp+2223+8], i16
	fxtod	%f2, u00
C 06
	srlx	i48, 16, %l5		C (i48 >> 16)
	add	i32, %g4, %g4		C i32+ now in g4
	ldx	[%sp+2223+16], i32
	fxtod	%f4, u32
C 07
	sllx	i48, 32, %l6		C (i48 << 32)
	or	%i3, %o5, %o5
	ldx	[%sp+2223+24], i48
	fdtox	a48, a48
C 08
	srlx	%g4, 32, %o3		C (i32 >> 32)
	add	%l5, %l4, %o1		C hi64- in %o1
	std	a00, [%sp+2223+0]
	fmuld	u00, v00, p00
C 09
	sllx	%g4, 16, %o2		C (i32 << 16)
	add	%o3, %o1, %o1		C hi64 in %o1   1st ASSIGNMENT
	std	a16, [%sp+2223+8]
	fmuld	u00, v16, p16
C 10
	sllx	%o1, 48, %o3		C (hi64 << 48)
	add	%g2, %o2, %o2		C mi64- in %o2
	std	a32, [%sp+2223+16]
	fmuld	u00, v32, p32
C 11
	add	%l6, %o2, %o2		C mi64- in %o2
	std	a48, [%sp+2223+24]
	faddd	p00, r64, a00
	fmuld	u32, v00, r32
C 12
	sub	%o2, %o3, %o2		C mi64 in %o2   1st ASSIGNMENT
	stx	%o5, [%i4+%i2]
	faddd	p16, r80, a16
	fmuld	u00, v48, p48
C 13
	add	cy, %g5, %o4		C x = prev(i00) + cy
	addcc	%i2, 8, %i2
	bnz,pt	%xcc, .Loop
	fmuld	u32, v16, r48
C END MAIN LOOP

.L_out_4:
	srlx	%o4, 16, %o5		C (x >> 16)
	fmuld	u32, v32, r64	C FIXME not urgent
	faddd	p32, r32, a32
	add	%o5, %o2, %o2		C mi64 in %o2   2nd ASSIGNMENT
	and	%o4, xffff, %o5		C (x & 0xffff)
	fdtox	a00, a00
	srlx	rlimb, 32, %g4		C HI(rlimb)
	and	rlimb, xffffffff, %g5	C LO(rlimb)
	ldx	[%i0+%i2], rlimb	C read rp[i]
	faddd	p48, r48, a48
	srlx	%o2, 48, %o7		C (mi64 >> 48)
	add	i00, %g5, %g5		C i00+ now in g5
	fmuld	u32, v48, r80	C FIXME not urgent
	fdtox	a16, a16
	sllx	%o2, 16, %i3		C (mi64 << 16)
	add	%o7, %o1, cy		C new cy
	ldx	[%sp+2223+0], i00
	fdtox	a32, a32
	srlx	i16, 48, %l4		C (i16 >> 48)
	mov	i16, %g2
	ldx	[%sp+2223+8], i16
	srlx	i48, 16, %l5		C (i48 >> 16)
	add	i32, %g4, %g4		C i32+ now in g4
	ldx	[%sp+2223+16], i32
	sllx	i48, 32, %l6		C (i48 << 32)
	or	%i3, %o5, %o5
	ldx	[%sp+2223+24], i48
	fdtox	a48, a48
	srlx	%g4, 32, %o3		C (i32 >> 32)
	add	%l5, %l4, %o1		C hi64- in %o1
	std	a00, [%sp+2223+0]
	sllx	%g4, 16, %o2		C (i32 << 16)
	add	%o3, %o1, %o1		C hi64 in %o1   1st ASSIGNMENT
	std	a16, [%sp+2223+8]
	sllx	%o1, 48, %o3		C (hi64 << 48)
	add	%g2, %o2, %o2		C mi64- in %o2
	std	a32, [%sp+2223+16]
	add	%l6, %o2, %o2		C mi64- in %o2
	std	a48, [%sp+2223+24]
	sub	%o2, %o3, %o2		C mi64 in %o2   1st ASSIGNMENT
	stx	%o5, [%i4+%i2]
	add	cy, %g5, %o4		C x = prev(i00) + cy
	add	%i2, 8, %i2
.L_out_3:
	srlx	%o4, 16, %o5		C (x >> 16)
	add	%o5, %o2, %o2		C mi64 in %o2   2nd ASSIGNMENT
	and	%o4, xffff, %o5		C (x & 0xffff)
	fdtox	r64, a00
	srlx	rlimb, 32, %g4		C HI(rlimb)
	and	rlimb, xffffffff, %g5	C LO(rlimb)
	ldx	[%i0+%i2], rlimb	C read rp[i]
	srlx	%o2, 48, %o7		C (mi64 >> 48)
	add	i00, %g5, %g5		C i00+ now in g5
	fdtox	r80, a16
	sllx	%o2, 16, %i3		C (mi64 << 16)
	add	%o7, %o1, cy		C new cy
	ldx	[%sp+2223+0], i00
	srlx	i16, 48, %l4		C (i16 >> 48)
	mov	i16, %g2
	ldx	[%sp+2223+8], i16
	srlx	i48, 16, %l5		C (i48 >> 16)
	add	i32, %g4, %g4		C i32+ now in g4
	ldx	[%sp+2223+16], i32
	sllx	i48, 32, %l6		C (i48 << 32)
	or	%i3, %o5, %o5
	ldx	[%sp+2223+24], i48
	srlx	%g4, 32, %o3		C (i32 >> 32)
	add	%l5, %l4, %o1		C hi64- in %o1
	std	a00, [%sp+2223+0]
	sllx	%g4, 16, %o2		C (i32 << 16)
	add	%o3, %o1, %o1		C hi64 in %o1   1st ASSIGNMENT
	std	a16, [%sp+2223+8]
	sllx	%o1, 48, %o3		C (hi64 << 48)
	add	%g2, %o2, %o2		C mi64- in %o2
	add	%l6, %o2, %o2		C mi64- in %o2
	sub	%o2, %o3, %o2		C mi64 in %o2   1st ASSIGNMENT
	stx	%o5, [%i4+%i2]
	add	cy, %g5, %o4		C x = prev(i00) + cy
	add	%i2, 8, %i2
.L_out_2:
	srlx	%o4, 16, %o5		C (x >> 16)
	add	%o5, %o2, %o2		C mi64 in %o2   2nd ASSIGNMENT
	and	%o4, xffff, %o5		C (x & 0xffff)
	srlx	rlimb, 32, %g4		C HI(rlimb)
	and	rlimb, xffffffff, %g5	C LO(rlimb)
	srlx	%o2, 48, %o7		C (mi64 >> 48)
	add	i00, %g5, %g5		C i00+ now in g5
	sllx	%o2, 16, %i3		C (mi64 << 16)
	add	%o7, %o1, cy		C new cy
	ldx	[%sp+2223+0], i00
	srlx	i16, 48, %l4		C (i16 >> 48)
	mov	i16, %g2
	ldx	[%sp+2223+8], i16
	srlx	i48, 16, %l5		C (i48 >> 16)
	add	i32, %g4, %g4		C i32+ now in g4
	sllx	i48, 32, %l6		C (i48 << 32)
	or	%i3, %o5, %o5
	srlx	%g4, 32, %o3		C (i32 >> 32)
	add	%l5, %l4, %o1		C hi64- in %o1
	sllx	%g4, 16, %o2		C (i32 << 16)
	add	%o3, %o1, %o1		C hi64 in %o1   1st ASSIGNMENT
	sllx	%o1, 48, %o3		C (hi64 << 48)
	add	%g2, %o2, %o2		C mi64- in %o2
	add	%l6, %o2, %o2		C mi64- in %o2
	sub	%o2, %o3, %o2		C mi64 in %o2   1st ASSIGNMENT
	stx	%o5, [%i4+%i2]
	add	cy, %g5, %o4		C x = prev(i00) + cy
	add	%i2, 8, %i2
.L_out_1:
	srlx	%o4, 16, %o5		C (x >> 16)
	add	%o5, %o2, %o2		C mi64 in %o2   2nd ASSIGNMENT
	and	%o4, xffff, %o5		C (x & 0xffff)
	srlx	%o2, 48, %o7		C (mi64 >> 48)
	sllx	%o2, 16, %i3		C (mi64 << 16)
	add	%o7, %o1, cy		C new cy
	or	%i3, %o5, %o5
	stx	%o5, [%i4+%i2]

	sllx	i00, 0, %g2
	add	%g2, cy, cy
	sllx	i16, 16, %g3
	add	%g3, cy, cy

	return	%i7+8
	mov	cy, %o0
EPILOGUE(mpn_addmul_1)