dnl  SPARC v9 64-bit mpn_mul_1 -- Multiply a limb vector with a limb and store
dnl  the result in a second limb vector.

dnl  Copyright 1998, 2000, 2001, 2002, 2003 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C		   cycles/limb
C UltraSPARC 1&2:     14
C UltraSPARC 3:	      18.5

C  Algorithm: We use eight floating-point multiplies per limb product, with the
C  invariant v operand split into four 16-bit pieces, and the s1 operand split
C  into 32-bit pieces.  We sum pairs of 48-bit partial products using
C  floating-point add, then convert the four 49-bit product-sums and transfer
C  them to the integer unit.

C  Possible optimizations:
C   1. Align the stack area where we transfer the four 49-bit product-sums
C      to a 32-byte boundary.  That would minimize the cache collision.
C      (UltraSPARC-1/2 use a direct-mapped cache.)  (Perhaps even better would
C      be to align the area to map to the area immediately before s1?)
C   2. Sum the 4 49-bit quantities using 32-bit operations, as in the
C      develop mpn_addmul_2.  This would save many integer instructions.
C   3. Unrolling.  Questionable if it is worth the code expansion, given that
C      it could only save 1 cycle/limb.
C   4. Specialize for particular v values.  If its upper 32 bits are zero, we
C      could save many operations, in the FPU (fmuld), but more so in the IEU
C      since we'll be summing 48-bit quantities, which might be simpler.
C   5. Ideally, we should schedule the f2/f3 and f4/f5 RAW further apart, and
C      the i00,i16,i32,i48 RAW less apart.  The latter apart-scheduling should
C      not be greater than needed for L2 cache latency, and also not so great
C      that i16 needs to be copied.
C   6. Avoid performing mem+fa+fm in the same cycle, at least not when we want
C      to get high IEU bandwidth.  (12 of the 14 cycles will be free for 2 IEU
C      ops.)

C  Instruction classification (as per UltraSPARC-1/2 functional units):
C      8 FM
C     10 FA
C     11 MEM
C      9 ISHIFT + 10? IADDLOG
C      1 BRANCH
C     49 insns totally (plus three mov insns that should be optimized out)

C  The loop executes 53 instructions in 14 cycles on UltraSPARC-1/2, i.e we
C  sustain 3.79 instructions/cycle.

C  INPUT PARAMETERS
C  rp	i0	C result limb pointer
C  up	i1	C source limb pointer
C  n	i2	C number of limbs
C  v	i3	C multiplier limb

ASM_START()
	REGISTER(%g2,#scratch)
	REGISTER(%g3,#scratch)

C  Register name aliases.  pNN/rNN hold 48-bit partial products of u pieces
C  times v pieces, aNN hold pairwise sums, vNN the four 16-bit splits of v,
C  u00/u32 the two 32-bit splits of the current up[] limb, and iNN the four
C  product-sums after transfer back to the integer unit.
define(`p00', `%f8') define(`p16',`%f10') define(`p32',`%f12') define(`p48',`%f14')
define(`r32',`%f16') define(`r48',`%f18') define(`r64',`%f20') define(`r80',`%f22')
define(`v00',`%f24') define(`v16',`%f26') define(`v32',`%f28') define(`v48',`%f30')
define(`u00',`%f32') define(`u32', `%f34')
define(`a00',`%f36') define(`a16',`%f38') define(`a32',`%f40') define(`a48',`%f42')
define(`cy',`%g1')
define(`rlimb',`%g3')
define(`i00',`%l0') define(`i16',`%l1') define(`i32',`%l2') define(`i48',`%l3')
define(`xffffffff',`%l7')
define(`xffff',`%o0')

PROLOGUE(mpn_mul_1)

C  Initialization.  (1) Split v operand into four 16-bit chunks and store them
C  as IEEE double in fp registers.  (2) Clear upper 32 bits of fp register pairs
C  f2 and f4.  (3) Store masks in registers aliased to `xffff' and `xffffffff'.
C  NOTE(review): [%sp+2223+N] is the scratch area used to shuttle data between
C  the integer and fp units; 2223 presumably = 2047 (v9 stack bias) + 176 — confirm.

	save	%sp, -256, %sp
	mov	-1, %g4
	srlx	%g4, 48, xffff		C store mask in register `xffff'
	and	%i3, xffff, %g2
	stx	%g2, [%sp+2223+0]
	srlx	%i3, 16, %g3
	and	%g3, xffff, %g3
	stx	%g3, [%sp+2223+8]
	srlx	%i3, 32, %g2
	and	%g2, xffff, %g2
	stx	%g2, [%sp+2223+16]
	srlx	%i3, 48, %g3
	stx	%g3, [%sp+2223+24]
	srlx	%g4, 32, xffffffff	C store mask in register `xffffffff'

	sllx	%i2, 3, %i2		C n in bytes
	mov	0, cy			C clear cy
	add	%i0, %i2, %i0		C rp += n; loop uses negative index %i2
	add	%i1, %i2, %i1		C up += n
	neg	%i2
	add	%i1, 4, %i5		C %i5 = up + 4 (for low-word loads)
	add	%i0, -32, %i4		C %i4 = store base for rp[i]
	add	%i0, -16, %i0
C  Load the four 16-bit v chunks (stored above) and convert to double.
	ldd	[%sp+2223+0], v00
	ldd	[%sp+2223+8], v16
	ldd	[%sp+2223+16], v32
	ldd	[%sp+2223+24], v48
	ld	[%sp+2223+0], %f2	C zero f2 (loads a 16-bit value's high word, i.e. 0)
	ld	[%sp+2223+0], %f4	C zero f4
	ld	[%i5+%i2], %f3		C read low 32 bits of up[i]
	ld	[%i1+%i2], %f5		C read high 32 bits of up[i]
	fxtod	v00, v00
	fxtod	v16, v16
	fxtod	v32, v32
	fxtod	v48, v48

C  Start real work.  (We sneakingly read f3 and f5 above...)
C  The software pipeline is very deep, requiring 4 feed-in stages.
C  Feed-in stage 1: convert the first limb and start its 8 multiplies.

	fxtod	%f2, u00
	fxtod	%f4, u32
	fmuld	u00, v00, a00
	fmuld	u00, v16, a16
	fmuld	u00, v32, p32
	fmuld	u32, v00, r32
	fmuld	u00, v48, p48
	addcc	%i2, 8, %i2
	bnz,pt	%xcc, .L_two_or_more
	fmuld	u32, v16, r48		C executes in the delay slot either way

C  n == 1: drain the pipeline without reading any further limbs.
.L_one:
	fmuld	u32, v32, r64	C FIXME not urgent
	faddd	p32, r32, a32
	fdtox	a00, a00
	faddd	p48, r48, a48
	fmuld	u32, v48, r80	C FIXME not urgent
	fdtox	a16, a16
	fdtox	a32, a32
	fdtox	a48, a48
	std	a00, [%sp+2223+0]
	std	a16, [%sp+2223+8]
	std	a32, [%sp+2223+16]
	std	a48, [%sp+2223+24]
	add	%i2, 8, %i2

	fdtox	r64, a00
	fdtox	r80, a16
	ldx	[%sp+2223+0], i00
	ldx	[%sp+2223+8], i16
	ldx	[%sp+2223+16], i32
	ldx	[%sp+2223+24], i48
	std	a00, [%sp+2223+0]
	std	a16, [%sp+2223+8]
	add	%i2, 8, %i2

	mov	i00, %g5		C i00+ now in g5
	ldx	[%sp+2223+0], i00
	srlx	i16, 48, %l4		C (i16 >> 48)
	mov	i16, %g2
	ldx	[%sp+2223+8], i16
	srlx	i48, 16, %l5		C (i48 >> 16)
	mov	i32, %g4		C i32+ now in g4
	sllx	i48, 32, %l6		C (i48 << 32)
	srlx	%g4, 32, %o3		C (i32 >> 32)
	add	%l5, %l4, %o1		C hi64- in %o1
	std	a00, [%sp+2223+0]
	sllx	%g4, 16, %o2		C (i32 << 16)
	add	%o3, %o1, %o1		C hi64 in %o1   1st ASSIGNMENT
	std	a16, [%sp+2223+8]
	sllx	%o1, 48, %o3		C (hi64 << 48)
	add	%g2, %o2, %o2		C mi64- in %o2
	add	%l6, %o2, %o2		C mi64- in %o2
	sub	%o2, %o3, %o2		C mi64 in %o2   1st ASSIGNMENT
	add	cy, %g5, %o4		C x = prev(i00) + cy
	b	.L_out_1
	add	%i2, 8, %i2		C delay slot

C  Feed-in stage 2: read the second limb, finish limb 1's sums, start limb 2.
.L_two_or_more:
	ld	[%i5+%i2], %f3	C read low 32 bits of up[i]
	fmuld	u32, v32, r64	C FIXME not urgent
	faddd	p32, r32, a32
	ld	[%i1+%i2], %f5	C read high 32 bits of up[i]
	fdtox	a00, a00
	faddd	p48, r48, a48
	fmuld	u32, v48, r80	C FIXME not urgent
	fdtox	a16, a16
	fdtox	a32, a32
	fxtod	%f2, u00
	fxtod	%f4, u32
	fdtox	a48, a48
	std	a00, [%sp+2223+0]
	fmuld	u00, v00, p00
	std	a16, [%sp+2223+8]
	fmuld	u00, v16, p16
	std	a32, [%sp+2223+16]
	fmuld	u00, v32, p32
	std	a48, [%sp+2223+24]
	faddd	p00, r64, a00
	fmuld	u32, v00, r32
	faddd	p16, r80, a16
	fmuld	u00, v48, p48
	addcc	%i2, 8, %i2
	bnz,pt	%xcc, .L_three_or_more
	fmuld	u32, v16, r48		C delay slot

C  n == 2: wind down from feed-in stage 2.
.L_two:
	fmuld	u32, v32, r64	C FIXME not urgent
	faddd	p32, r32, a32
	fdtox	a00, a00
	faddd	p48, r48, a48
	fmuld	u32, v48, r80	C FIXME not urgent
	fdtox	a16, a16
	ldx	[%sp+2223+0], i00
	fdtox	a32, a32
	ldx	[%sp+2223+8], i16
	ldx	[%sp+2223+16], i32
	ldx	[%sp+2223+24], i48
	fdtox	a48, a48
	std	a00, [%sp+2223+0]
	std	a16, [%sp+2223+8]
	std	a32, [%sp+2223+16]
	std	a48, [%sp+2223+24]
	add	%i2, 8, %i2

	fdtox	r64, a00
	mov	i00, %g5		C i00+ now in g5
	fdtox	r80, a16
	ldx	[%sp+2223+0], i00
	srlx	i16, 48, %l4		C (i16 >> 48)
	mov	i16, %g2
	ldx	[%sp+2223+8], i16
	srlx	i48, 16, %l5		C (i48 >> 16)
	mov	i32, %g4		C i32+ now in g4
	ldx	[%sp+2223+16], i32
	sllx	i48, 32, %l6		C (i48 << 32)
	ldx	[%sp+2223+24], i48
	srlx	%g4, 32, %o3		C (i32 >> 32)
	add	%l5, %l4, %o1		C hi64- in %o1
	std	a00, [%sp+2223+0]
	sllx	%g4, 16, %o2		C (i32 << 16)
	add	%o3, %o1, %o1		C hi64 in %o1   1st ASSIGNMENT
	std	a16, [%sp+2223+8]
	sllx	%o1, 48, %o3		C (hi64 << 48)
	add	%g2, %o2, %o2		C mi64- in %o2
	add	%l6, %o2, %o2		C mi64- in %o2
	sub	%o2, %o3, %o2		C mi64 in %o2   1st ASSIGNMENT
	add	cy, %g5, %o4		C x = prev(i00) + cy
	b	.L_out_2
	add	%i2, 8, %i2		C delay slot

C  Feed-in stage 3: first integer-unit recombination overlaps fp work.
.L_three_or_more:
	ld	[%i5+%i2], %f3	C read low 32 bits of up[i]
	fmuld	u32, v32, r64	C FIXME not urgent
	faddd	p32, r32, a32
	ld	[%i1+%i2], %f5	C read high 32 bits of up[i]
	fdtox	a00, a00
	faddd	p48, r48, a48
	fmuld	u32, v48, r80	C FIXME not urgent
	fdtox	a16, a16
	ldx	[%sp+2223+0], i00
	fdtox	a32, a32
	ldx	[%sp+2223+8], i16
	fxtod	%f2, u00
	ldx	[%sp+2223+16], i32
	fxtod	%f4, u32
	ldx	[%sp+2223+24], i48
	fdtox	a48, a48
	std	a00, [%sp+2223+0]
	fmuld	u00, v00, p00
	std	a16, [%sp+2223+8]
	fmuld	u00, v16, p16
	std	a32, [%sp+2223+16]
	fmuld	u00, v32, p32
	std	a48, [%sp+2223+24]
	faddd	p00, r64, a00
	fmuld	u32, v00, r32
	faddd	p16, r80, a16
	fmuld	u00, v48, p48
	addcc	%i2, 8, %i2
	bnz,pt	%xcc, .L_four_or_more
	fmuld	u32, v16, r48		C delay slot

C  n == 3: wind down from feed-in stage 3.
.L_three:
	fmuld	u32, v32, r64	C FIXME not urgent
	faddd	p32, r32, a32
	fdtox	a00, a00
	faddd	p48, r48, a48
	mov	i00, %g5		C i00+ now in g5
	fmuld	u32, v48, r80	C FIXME not urgent
	fdtox	a16, a16
	ldx	[%sp+2223+0], i00
	fdtox	a32, a32
	srlx	i16, 48, %l4		C (i16 >> 48)
	mov	i16, %g2
	ldx	[%sp+2223+8], i16
	srlx	i48, 16, %l5		C (i48 >> 16)
	mov	i32, %g4		C i32+ now in g4
	ldx	[%sp+2223+16], i32
	sllx	i48, 32, %l6		C (i48 << 32)
	ldx	[%sp+2223+24], i48
	fdtox	a48, a48
	srlx	%g4, 32, %o3		C (i32 >> 32)
	add	%l5, %l4, %o1		C hi64- in %o1
	std	a00, [%sp+2223+0]
	sllx	%g4, 16, %o2		C (i32 << 16)
	add	%o3, %o1, %o1		C hi64 in %o1   1st ASSIGNMENT
	std	a16, [%sp+2223+8]
	sllx	%o1, 48, %o3		C (hi64 << 48)
	add	%g2, %o2, %o2		C mi64- in %o2
	std	a32, [%sp+2223+16]
	add	%l6, %o2, %o2		C mi64- in %o2
	std	a48, [%sp+2223+24]
	sub	%o2, %o3, %o2		C mi64 in %o2   1st ASSIGNMENT
	add	cy, %g5, %o4		C x = prev(i00) + cy
	b	.L_out_3
	add	%i2, 8, %i2		C delay slot

C  Feed-in stage 4: full pipeline depth reached; falls into the main loop.
.L_four_or_more:
	ld	[%i5+%i2], %f3	C read low 32 bits of up[i]
	fmuld	u32, v32, r64	C FIXME not urgent
	faddd	p32, r32, a32
	ld	[%i1+%i2], %f5	C read high 32 bits of up[i]
	fdtox	a00, a00
	faddd	p48, r48, a48
	mov	i00, %g5		C i00+ now in g5
	fmuld	u32, v48, r80	C FIXME not urgent
	fdtox	a16, a16
	ldx	[%sp+2223+0], i00
	fdtox	a32, a32
	srlx	i16, 48, %l4		C (i16 >> 48)
	mov	i16, %g2
	ldx	[%sp+2223+8], i16
	fxtod	%f2, u00
	srlx	i48, 16, %l5		C (i48 >> 16)
	mov	i32, %g4		C i32+ now in g4
	ldx	[%sp+2223+16], i32
	fxtod	%f4, u32
	sllx	i48, 32, %l6		C (i48 << 32)
	ldx	[%sp+2223+24], i48
	fdtox	a48, a48
	srlx	%g4, 32, %o3		C (i32 >> 32)
	add	%l5, %l4, %o1		C hi64- in %o1
	std	a00, [%sp+2223+0]
	fmuld	u00, v00, p00
	sllx	%g4, 16, %o2		C (i32 << 16)
	add	%o3, %o1, %o1		C hi64 in %o1   1st ASSIGNMENT
	std	a16, [%sp+2223+8]
	fmuld	u00, v16, p16
	sllx	%o1, 48, %o3		C (hi64 << 48)
	add	%g2, %o2, %o2		C mi64- in %o2
	std	a32, [%sp+2223+16]
	fmuld	u00, v32, p32
	add	%l6, %o2, %o2		C mi64- in %o2
	std	a48, [%sp+2223+24]
	faddd	p00, r64, a00
	fmuld	u32, v00, r32
	sub	%o2, %o3, %o2		C mi64 in %o2   1st ASSIGNMENT
	faddd	p16, r80, a16
	fmuld	u00, v48, p48
	add	cy, %g5, %o4		C x = prev(i00) + cy
	addcc	%i2, 8, %i2
	bnz,pt	%xcc, .Loop
	fmuld	u32, v16, r48		C delay slot

.L_four:
	b,a	.L_out_4

C BEGIN MAIN LOOP
	.align	16
C  Each iteration is hand-scheduled into 14 issue groups (C 00 .. C 13),
C  interleaving integer recombination of limb i-2, fp conversion of limb i-1,
C  and fp multiplies of limb i.
.Loop:
C 00
	srlx	%o4, 16, %o5		C (x >> 16)
	ld	[%i5+%i2], %f3	C read low 32 bits of up[i]
	fmuld	u32, v32, r64	C FIXME not urgent
	faddd	p32, r32, a32
C 01
	add	%o5, %o2, %o2		C mi64 in %o2   2nd ASSIGNMENT
	and	%o4, xffff, %o5		C (x & 0xffff)
	ld	[%i1+%i2], %f5	C read high 32 bits of up[i]
	fdtox	a00, a00
C 02
	faddd	p48, r48, a48
C 03
	srlx	%o2, 48, %o7		C (mi64 >> 48)
	mov	i00, %g5		C i00+ now in g5
	fmuld	u32, v48, r80	C FIXME not urgent
	fdtox	a16, a16
C 04
	sllx	%o2, 16, %i3		C (mi64 << 16)
	add	%o7, %o1, cy		C new cy
	ldx	[%sp+2223+0], i00
	fdtox	a32, a32
C 05
	srlx	i16, 48, %l4		C (i16 >> 48)
	mov	i16, %g2
	ldx	[%sp+2223+8], i16
	fxtod	%f2, u00
C 06
	srlx	i48, 16, %l5		C (i48 >> 16)
	mov	i32, %g4		C i32+ now in g4
	ldx	[%sp+2223+16], i32
	fxtod	%f4, u32
C 07
	sllx	i48, 32, %l6		C (i48 << 32)
	or	%i3, %o5, %o5		C assemble result limb
	ldx	[%sp+2223+24], i48
	fdtox	a48, a48
C 08
	srlx	%g4, 32, %o3		C (i32 >> 32)
	add	%l5, %l4, %o1		C hi64- in %o1
	std	a00, [%sp+2223+0]
	fmuld	u00, v00, p00
C 09
	sllx	%g4, 16, %o2		C (i32 << 16)
	add	%o3, %o1, %o1		C hi64 in %o1   1st ASSIGNMENT
	std	a16, [%sp+2223+8]
	fmuld	u00, v16, p16
C 10
	sllx	%o1, 48, %o3		C (hi64 << 48)
	add	%g2, %o2, %o2		C mi64- in %o2
	std	a32, [%sp+2223+16]
	fmuld	u00, v32, p32
C 11
	add	%l6, %o2, %o2		C mi64- in %o2
	std	a48, [%sp+2223+24]
	faddd	p00, r64, a00
	fmuld	u32, v00, r32
C 12
	sub	%o2, %o3, %o2		C mi64 in %o2   1st ASSIGNMENT
	stx	%o5, [%i4+%i2]		C rp[i-2] = assembled limb
	faddd	p16, r80, a16
	fmuld	u00, v48, p48
C 13
	add	cy, %g5, %o4		C x = prev(i00) + cy
	addcc	%i2, 8, %i2
	bnz,pt	%xcc, .Loop
	fmuld	u32, v16, r48		C delay slot
C END MAIN LOOP

C  Wind-down stages: four copies of the loop body with the fp front end
C  progressively removed, draining the last pipeline stages.
.L_out_4:
	srlx	%o4, 16, %o5		C (x >> 16)
	fmuld	u32, v32, r64	C FIXME not urgent
	faddd	p32, r32, a32
	add	%o5, %o2, %o2		C mi64 in %o2   2nd ASSIGNMENT
	and	%o4, xffff, %o5		C (x & 0xffff)
	fdtox	a00, a00
	faddd	p48, r48, a48
	srlx	%o2, 48, %o7		C (mi64 >> 48)
	mov	i00, %g5		C i00+ now in g5
	fmuld	u32, v48, r80	C FIXME not urgent
	fdtox	a16, a16
	sllx	%o2, 16, %i3		C (mi64 << 16)
	add	%o7, %o1, cy		C new cy
	ldx	[%sp+2223+0], i00
	fdtox	a32, a32
	srlx	i16, 48, %l4		C (i16 >> 48)
	mov	i16, %g2
	ldx	[%sp+2223+8], i16
	srlx	i48, 16, %l5		C (i48 >> 16)
	mov	i32, %g4		C i32+ now in g4
	ldx	[%sp+2223+16], i32
	sllx	i48, 32, %l6		C (i48 << 32)
	or	%i3, %o5, %o5
	ldx	[%sp+2223+24], i48
	fdtox	a48, a48
	srlx	%g4, 32, %o3		C (i32 >> 32)
	add	%l5, %l4, %o1		C hi64- in %o1
	std	a00, [%sp+2223+0]
	sllx	%g4, 16, %o2		C (i32 << 16)
	add	%o3, %o1, %o1		C hi64 in %o1   1st ASSIGNMENT
	std	a16, [%sp+2223+8]
	sllx	%o1, 48, %o3		C (hi64 << 48)
	add	%g2, %o2, %o2		C mi64- in %o2
	std	a32, [%sp+2223+16]
	add	%l6, %o2, %o2		C mi64- in %o2
	std	a48, [%sp+2223+24]
	sub	%o2, %o3, %o2		C mi64 in %o2   1st ASSIGNMENT
	stx	%o5, [%i4+%i2]
	add	cy, %g5, %o4		C x = prev(i00) + cy
	add	%i2, 8, %i2
.L_out_3:
	srlx	%o4, 16, %o5		C (x >> 16)
	add	%o5, %o2, %o2		C mi64 in %o2   2nd ASSIGNMENT
	and	%o4, xffff, %o5		C (x & 0xffff)
	fdtox	r64, a00
	srlx	%o2, 48, %o7		C (mi64 >> 48)
	mov	i00, %g5		C i00+ now in g5
	fdtox	r80, a16
	sllx	%o2, 16, %i3		C (mi64 << 16)
	add	%o7, %o1, cy		C new cy
	ldx	[%sp+2223+0], i00
	srlx	i16, 48, %l4		C (i16 >> 48)
	mov	i16, %g2
	ldx	[%sp+2223+8], i16
	srlx	i48, 16, %l5		C (i48 >> 16)
	mov	i32, %g4		C i32+ now in g4
	ldx	[%sp+2223+16], i32
	sllx	i48, 32, %l6		C (i48 << 32)
	or	%i3, %o5, %o5
	ldx	[%sp+2223+24], i48
	srlx	%g4, 32, %o3		C (i32 >> 32)
	add	%l5, %l4, %o1		C hi64- in %o1
	std	a00, [%sp+2223+0]
	sllx	%g4, 16, %o2		C (i32 << 16)
	add	%o3, %o1, %o1		C hi64 in %o1   1st ASSIGNMENT
	std	a16, [%sp+2223+8]
	sllx	%o1, 48, %o3		C (hi64 << 48)
	add	%g2, %o2, %o2		C mi64- in %o2
	add	%l6, %o2, %o2		C mi64- in %o2
	sub	%o2, %o3, %o2		C mi64 in %o2   1st ASSIGNMENT
	stx	%o5, [%i4+%i2]
	add	cy, %g5, %o4		C x = prev(i00) + cy
	add	%i2, 8, %i2
.L_out_2:
	srlx	%o4, 16, %o5		C (x >> 16)
	add	%o5, %o2, %o2		C mi64 in %o2   2nd ASSIGNMENT
	and	%o4, xffff, %o5		C (x & 0xffff)
	srlx	%o2, 48, %o7		C (mi64 >> 48)
	mov	i00, %g5		C i00+ now in g5
	sllx	%o2, 16, %i3		C (mi64 << 16)
	add	%o7, %o1, cy		C new cy
	ldx	[%sp+2223+0], i00
	srlx	i16, 48, %l4		C (i16 >> 48)
	mov	i16, %g2
	ldx	[%sp+2223+8], i16
	srlx	i48, 16, %l5		C (i48 >> 16)
	mov	i32, %g4		C i32+ now in g4
	sllx	i48, 32, %l6		C (i48 << 32)
	or	%i3, %o5, %o5
	srlx	%g4, 32, %o3		C (i32 >> 32)
	add	%l5, %l4, %o1		C hi64- in %o1
	sllx	%g4, 16, %o2		C (i32 << 16)
	add	%o3, %o1, %o1		C hi64 in %o1   1st ASSIGNMENT
	sllx	%o1, 48, %o3		C (hi64 << 48)
	add	%g2, %o2, %o2		C mi64- in %o2
	add	%l6, %o2, %o2		C mi64- in %o2
	sub	%o2, %o3, %o2		C mi64 in %o2   1st ASSIGNMENT
	stx	%o5, [%i4+%i2]
	add	cy, %g5, %o4		C x = prev(i00) + cy
	add	%i2, 8, %i2
.L_out_1:
	srlx	%o4, 16, %o5		C (x >> 16)
	add	%o5, %o2, %o2		C mi64 in %o2   2nd ASSIGNMENT
	and	%o4, xffff, %o5		C (x & 0xffff)
	srlx	%o2, 48, %o7		C (mi64 >> 48)
	sllx	%o2, 16, %i3		C (mi64 << 16)
	add	%o7, %o1, cy		C new cy
	or	%i3, %o5, %o5
	stx	%o5, [%i4+%i2]		C store final result limb

C  Fold the two remaining product-sums into the carry to form the return limb.
	sllx	i00, 0, %g2
	add	%g2, cy, cy
	sllx	i16, 16, %g3
	add	%g3, cy, cy

	return	%i7+8
	mov	cy, %o0			C return high limb in %o0 (delay slot)
EPILOGUE(mpn_mul_1)