dnl  Alpha ev6 mpn_mul_1 -- Multiply a limb vector with a limb and store the
dnl  result in a second limb vector.

dnl  Copyright 2000, 2001, 2005 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C INPUT PARAMETERS
C res_ptr	r16
C s1_ptr	r17
C size		r18
C s2_limb	r19

C OUTPUT
C   size limbs are stored through res_ptr: for each limb of s1 the low half of
C   (limb * s2_limb) plus the running carry limb.  r0 holds the final carry
C   (most significant) limb when the routine returns.
C
C   size is compared unsigned (cmpult) and must be at least 1: the small path
C   loads and multiplies the first limb before it looks at the count.

C This code runs at 2.25 cycles/limb on EV6.

C This code was written in close cooperation with ev6 pipeline expert
C Steve Root.  Any errors are tege's fault, though.

C Code structure:

C  code for n < 8
C  code for n >= 8	code for (n mod 8)
C			code for (n div 8)	feed-in code
C						8-way unrolled loop
C						wind-down code

C Some notes about unrolled loop:
C
C   r1-r8     multiplies and workup
C   r21-r28   multiplies and workup
C   r9-r12    loads
C   r0       -1
C   r20,r29,r13-r15  scramble
C
C   We're doing 7 of the 8 carry propagations with a br fixup code and 1 with a
C   put-the-carry-into-hi.  The idea is that these branches are very rarely
C   taken, and since a non-taken branch consumes no resources, that is better
C   than an addq.
C
C   Software pipeline: a load in cycle #09, feeds a mul in cycle #16, feeds an
C   add NEXT cycle #09 which feeds a store in NEXT cycle #02
C
C   Reading the per-instruction tags: #NN is the cycle slot referred to above.
C   U0/U1/L0/L1 presumably name the EV6 integer pipe the instruction is meant
C   to issue in -- confirm against the 21264 documentation before rescheduling.
C   The instruction order is part of the design; do not reorder casually.

C The code could use some further work:
C   1. Speed up really small multiplies.  The default alpha/mul_1.asm code is
C      faster than this for size < 3.
C   2. Improve feed-in code, perhaps with the equivalent of switch(n%8) unless
C      that is too costly.
C   3. Consider using 4-way unrolling, even if that runs slower.
C   4. Reduce register usage.  In particular, try to avoid using r29.

ASM_START()
PROLOGUE(mpn_mul_1)
	cmpult	r18, 8, r1		C r1 = (size < 8), unsigned
	beq	r1, $Large		C size >= 8: take the unrolled path

C ---------------------------------------------------------------
C Small path, size < 8.  One limb per iteration, no stack frame; only
C r0-r4 and the argument registers are written.
C   r0 = pending carry into the next limb, r4 = previous high product.
$Lsmall:
	ldq	r2,0(r17)		C r2 = s1_limb
	lda	r18,-1(r18)		C size--
	mulq	r2,r19,r3		C r3 = prod_low
	bic	r31,r31,r4		C clear cy_limb
	umulh	r2,r19,r0		C r0 = prod_high
	beq	r18,$Le1a		C jump if size was == 1
	ldq	r2,8(r17)		C r2 = s1_limb
	lda	r18,-1(r18)		C size--
	stq	r3,0(r16)
	beq	r18,$Le2a		C jump if size was == 2
	ALIGN(8)
C The loop runs one limb ahead on the source side (load at 16(r17)) and
C stores at 8(r16), because neither pointer was advanced for limb 0.
$Lopa:	mulq	r2,r19,r3		C r3 = prod_low
	addq	r4,r0,r0		C cy_limb = cy_limb + 'cy'
	lda	r18,-1(r18)		C size--
	umulh	r2,r19,r4		C r4 = cy_limb
	ldq	r2,16(r17)		C r2 = s1_limb
	lda	r17,8(r17)		C s1_ptr++
	addq	r3,r0,r3		C r3 = cy_limb + prod_low
	stq	r3,8(r16)
	cmpult	r3,r0,r0		C r0 = carry from (cy_limb + prod_low)
	lda	r16,8(r16)		C res_ptr++
	bne	r18,$Lopa

C Last limb of the small path: same arithmetic as the loop, no further load.
$Le2a:	mulq	r2,r19,r3		C r3 = prod_low
	addq	r4,r0,r0		C cy_limb = cy_limb + 'cy'
	umulh	r2,r19,r4		C r4 = cy_limb
	addq	r3,r0,r3		C r3 = cy_limb + prod_low
	cmpult	r3,r0,r0		C r0 = carry from (cy_limb + prod_low)
	stq	r3,8(r16)
	addq	r4,r0,r0		C cy_limb = prod_high + cy
	ret	r31,(r26),1
C size == 1: r0 already holds prod_high, only the low limb needs storing.
$Le1a:	stq	r3,0(r16)
	ret	r31,(r26),1

C ---------------------------------------------------------------
C Large path, size >= 8.  Allocates a 224-byte frame (only offsets 0..64 are
C written) and saves r26, r9-r15 and r29, all of which are overwritten below.
C r26 is the return address used by the final ret, and it doubles as a
C multiply result register in the unrolled code, hence the save.
$Large:
	lda	r30, -224(r30)
	stq	r26, 0(r30)
	stq	r9, 8(r30)
	stq	r10, 16(r30)
	stq	r11, 24(r30)
	stq	r12, 32(r30)
	stq	r13, 40(r30)
	stq	r14, 48(r30)
	stq	r15, 56(r30)
	stq	r29, 64(r30)

	and	r18, 7, r20		C count for the first loop, 0-7
	srl	r18, 3, r18		C count for unrolled loop
	bis	r31, r31, r21		C r21 = 0: carry limb into the unrolled code
	beq	r20, $L_8_or_more	C skip first loop

C First loop: handles the (size mod 8) leading limbs one at a time, same
C scheme as the small path but advancing both pointers every limb.  On exit
C r21 holds the carry limb that is fed into the unrolled code.
$L_9_or_more:
	ldq	r2,0(r17)		C r2 = s1_limb
	lda	r17,8(r17)		C s1_ptr++
	lda	r20,-1(r20)		C size--
	mulq	r2,r19,r3		C r3 = prod_low
	umulh	r2,r19,r21		C r21 = prod_high
	beq	r20,$Le1b		C jump if size was == 1
	bis	r31, r31, r0		C FIXME: shouldn't need this
					C (zeroes the 'cy' that $Lopb/$Le2b add to r21)
	ldq	r2,0(r17)		C r2 = s1_limb
	lda	r17,8(r17)		C s1_ptr++
	lda	r20,-1(r20)		C size--
	stq	r3,0(r16)
	lda	r16,8(r16)		C res_ptr++
	beq	r20,$Le2b		C jump if size was == 2
	ALIGN(8)
$Lopb:	mulq	r2,r19,r3		C r3 = prod_low
	addq	r21,r0,r0		C cy_limb = cy_limb + 'cy'
	lda	r20,-1(r20)		C size--
	umulh	r2,r19,r21		C r21 = prod_high
	ldq	r2,0(r17)		C r2 = s1_limb
	lda	r17,8(r17)		C s1_ptr++
	addq	r3,r0,r3		C r3 = cy_limb + prod_low
	stq	r3,0(r16)
	cmpult	r3,r0,r0		C r0 = carry from (cy_limb + prod_low)
	lda	r16,8(r16)		C res_ptr++
	bne	r20,$Lopb

$Le2b:	mulq	r2,r19,r3		C r3 = prod_low
	addq	r21,r0,r0		C cy_limb = cy_limb + 'cy'
	umulh	r2,r19,r21		C r21 = prod_high
	addq	r3,r0,r3		C r3 = cy_limb + prod_low
	cmpult	r3,r0,r0		C r0 = carry from (cy_limb + prod_low)
	stq	r3,0(r16)
	lda	r16,8(r16)		C res_ptr++
	addq	r21,r0,r21		C cy_limb = prod_high + cy
	br	r31, $L_8_or_more
C Exactly one leading limb: store it; r21 = prod_high is the carry limb.
$Le1b:	stq	r3,0(r16)
	lda	r16,8(r16)		C res_ptr++

C ---------------------------------------------------------------
C Feed-in for the 8-way unrolled code.  r18 = number of 8-limb blocks,
C r21 = carry limb from the first loop (or 0).  r17 is biased by -32 so the
C loads below use the same offsets (32..64, then 8..24 after the +64 bump)
C as the loop body.
$L_8_or_more:
	lda	r0, -1(r31)		C put -1 in r0, for tricky loop control
	lda	r17, -32(r17)		C L1 bookkeeping
	lda	r18, -1(r18)		C decrement count

	ldq	r9, 32(r17)		C L1
	ldq	r10, 40(r17)		C L1
	mulq	r9, r19, r22		C U1 #07
	ldq	r11, 48(r17)		C L1
	umulh	r9, r19, r23		C U1 #08
	ldq	r12, 56(r17)		C L1
	mulq	r10, r19, r24		C U1 #09
	ldq	r9, 64(r17)		C L1

	lda	r17, 64(r17)		C L1 bookkeeping

	umulh	r10, r19, r25		C U1 #11
	mulq	r11, r19, r26		C U1 #12
	umulh	r11, r19, r27		C U1 #13
	mulq	r12, r19, r28		C U1 #14
	ldq	r10, 8(r17)		C L1
	umulh	r12, r19, r1		C U1 #15
	ldq	r11, 16(r17)		C L1
	mulq	r9, r19, r2		C U1 #16
	ldq	r12, 24(r17)		C L1
	umulh	r9, r19, r3		C U1 #17
	addq	r21, r22, r13		C L1 mov
	mulq	r10, r19, r4		C U1 #18
	addq	r23, r24, r22		C L0 sum 2 mul's
	cmpult	r13, r21, r14		C L1 carry from sum
	bgt	r18, $L_16_or_more	C more than one 8-limb block remains

C Exactly one 8-limb block: issue the remaining multiplies and join the
C wind-down code at $ret0c.  No further source limbs are loaded here.
	cmpult	r22, r24, r24		C U0 carry from sum
	umulh	r10, r19, r5		C U1 #02
	addq	r25, r26, r23		C U0 sum 2 mul's
	mulq	r11, r19, r6		C U1 #03
	cmpult	r23, r26, r25		C U0 carry from sum
	umulh	r11, r19, r7		C U1 #04
	addq	r27, r28, r28		C U0 sum 2 mul's
	mulq	r12, r19, r8		C U1 #05
	cmpult	r28, r27, r15		C L0 carry from sum
	lda	r16, 32(r16)		C L1 bookkeeping
	addq	r13, r31, r13		C U0 start carry cascade
					C (adds zero: no carry-in for the first block)
	umulh	r12, r19, r21		C U1 #06
	br	r31, $ret0c

C Two or more 8-limb blocks: this is a copy of the loop body without the
C stores of a previous block and without a carry-in for the first limb.
C The labels $ret0w and $ret4w-$ret6w are not branch targets in this file.
$L_16_or_more:
C ---------------------------------------------------------------
	subq	r18,1,r18
	cmpult	r22, r24, r24		C U0 carry from sum
	ldq	r9, 32(r17)		C L1

	umulh	r10, r19, r5		C U1 #02
	addq	r25, r26, r23		C U0 sum 2 mul's
	mulq	r11, r19, r6		C U1 #03
	cmpult	r23, r26, r25		C U0 carry from sum
	umulh	r11, r19, r7		C U1 #04
	addq	r27, r28, r28		C U0 sum 2 mul's
	mulq	r12, r19, r8		C U1 #05
	cmpult	r28, r27, r15		C L0 carry from sum
	lda	r16, 32(r16)		C L1 bookkeeping
	addq	r13, r31, r13		C U0 start carry cascade

	umulh	r12, r19, r21		C U1 #06
C	beq	r13, $fix0w		C U0
$ret0w:	addq	r22, r14, r26		C L0
	ldq	r10, 40(r17)		C L1

	mulq	r9, r19, r22		C U1 #07
	beq	r26, $fix1w		C U0
$ret1w:	addq	r23, r24, r27		C L0
	ldq	r11, 48(r17)		C L1

	umulh	r9, r19, r23		C U1 #08
	beq	r27, $fix2w		C U0
$ret2w:	addq	r28, r25, r28		C L0
	ldq	r12, 56(r17)		C L1

	mulq	r10, r19, r24		C U1 #09
	beq	r28, $fix3w		C U0
$ret3w:	addq	r1, r2, r20		C L0 sum 2 mul's
	ldq	r9, 64(r17)		C L1

	addq	r3, r4, r2		C L0 #10 2 mul's
	lda	r17, 64(r17)		C L1 bookkeeping
	cmpult	r20, r1, r29		C U0 carry from sum

	umulh	r10, r19, r25		C U1 #11
	cmpult	r2, r4, r4		C U0 carry from sum
	stq	r13, -32(r16)		C L0
	stq	r26, -24(r16)		C L1

	mulq	r11, r19, r26		C U1 #12
	addq	r5, r6, r14		C U0 sum 2 mul's
	stq	r27, -16(r16)		C L0
	stq	r28, -8(r16)		C L1

	umulh	r11, r19, r27		C U1 #13
	cmpult	r14, r6, r3		C U0 carry from sum
C could do cross-jumping here:
C	bra	$L_middle_of_unrolled_loop
	mulq	r12, r19, r28		C U1 #14
	addq	r7, r3, r5		C L0 eat carry
	addq	r20, r15, r20		C U0 carry cascade
	ldq	r10, 8(r17)		C L1

C From here the fix-up branches go to $fix4/$fix5/$fix6, which return to
C $ret4/$ret5/$ret6 inside $Loop rather than to the labels below.  The
C instructions from $ret4 to the loop-closing branch are the same as the ones
C from $ret4w to the ble below, and "bgt r18,$Loop" falling into $Lend selects
C the same successor as "ble r18,$Lend" falling into $Loop.
	umulh	r12, r19, r1		C U1 #15
	beq	r20, $fix4		C U0
$ret4w:	addq	r2, r29, r6		C L0
	ldq	r11, 16(r17)		C L1

	mulq	r9, r19, r2		C U1 #16
	beq	r6, $fix5		C U0
$ret5w:	addq	r14, r4, r7		C L0
	ldq	r12, 24(r17)		C L1

	umulh	r9, r19, r3		C U1 #17
	beq	r7, $fix6		C U0
$ret6w:	addq	r5, r8, r8		C L0 sum 2
	addq	r21, r22, r13		C L1 sum 2 mul's

	mulq	r10, r19, r4		C U1 #18
	addq	r23, r24, r22		C L0 sum 2 mul's
	cmpult	r13, r21, r14		C L1 carry from sum
	ble	r18, $Lend		C U0
C ---------------------------------------------------------------
C Main loop: 8 limbs per iteration.  Each iteration stores the last four
C limbs of the previous block (offsets 0..24 before r16 is bumped by 64) and
C the first four limbs of the current one (offsets -32..-8 afterwards).
C
C Carry handling: every "beq sum, $fixN" tests a limb sum that already
C includes its incoming carry bit.  The carry out of the two-product sum is
C computed separately by cmpult; the only case that misses is the incoming
C carry wrapping the sum to zero, and the $fixN stub patches that up.
	ALIGN(16)
$Loop:
	umulh	r0, r18, r18		C U1 #01 decrement r18!
					C (high half of -1 * r18 is r18 - 1 for r18 >= 1)
	cmpult	r8, r5, r29		C L0 carry from last bunch
	cmpult	r22, r24, r24		C U0 carry from sum
	ldq	r9, 32(r17)		C L1

	umulh	r10, r19, r5		C U1 #02
	addq	r25, r26, r23		C U0 sum 2 mul's
	stq	r20, 0(r16)		C L0
	stq	r6, 8(r16)		C L1

	mulq	r11, r19, r6		C U1 #03
	cmpult	r23, r26, r25		C U0 carry from sum
	stq	r7, 16(r16)		C L0
	stq	r8, 24(r16)		C L1

	umulh	r11, r19, r7		C U1 #04
	bis	r31, r31, r31		C L0 st slosh
	bis	r31, r31, r31		C L1 st slosh
	addq	r27, r28, r28		C U0 sum 2 mul's

	mulq	r12, r19, r8		C U1 #05
	cmpult	r28, r27, r15		C L0 carry from sum
	lda	r16, 64(r16)		C L1 bookkeeping
	addq	r13, r29, r13		C U0 start carry cascade

	umulh	r12, r19, r21		C U1 #06
	beq	r13, $fix0		C U0
$ret0:	addq	r22, r14, r26		C L0
	ldq	r10, 40(r17)		C L1

	mulq	r9, r19, r22		C U1 #07
	beq	r26, $fix1		C U0
$ret1:	addq	r23, r24, r27		C L0
	ldq	r11, 48(r17)		C L1

	umulh	r9, r19, r23		C U1 #08
	beq	r27, $fix2		C U0
$ret2:	addq	r28, r25, r28		C L0
	ldq	r12, 56(r17)		C L1

	mulq	r10, r19, r24		C U1 #09
	beq	r28, $fix3		C U0
$ret3:	addq	r1, r2, r20		C L0 sum 2 mul's
	ldq	r9, 64(r17)		C L1

	addq	r3, r4, r2		C L0 #10 2 mul's
	bis	r31, r31, r31		C U1 mul hole
	lda	r17, 64(r17)		C L1 bookkeeping
	cmpult	r20, r1, r29		C U0 carry from sum

	umulh	r10, r19, r25		C U1 #11
	cmpult	r2, r4, r4		C U0 carry from sum
	stq	r13, -32(r16)		C L0
	stq	r26, -24(r16)		C L1

	mulq	r11, r19, r26		C U1 #12
	addq	r5, r6, r14		C U0 sum 2 mul's
	stq	r27, -16(r16)		C L0
	stq	r28, -8(r16)		C L1

	umulh	r11, r19, r27		C U1 #13
	bis	r31, r31, r31		C L0 st slosh
	bis	r31, r31, r31		C L1 st slosh
	cmpult	r14, r6, r3		C U0 carry from sum
C This label is mentioned only in the commented-out cross-jump above.
$L_middle_of_unrolled_loop:
	mulq	r12, r19, r28		C U1 #14
	addq	r7, r3, r5		C L0 eat carry
	addq	r20, r15, r20		C U0 carry cascade
	ldq	r10, 8(r17)		C L1

	umulh	r12, r19, r1		C U1 #15
	beq	r20, $fix4		C U0
$ret4:	addq	r2, r29, r6		C L0
	ldq	r11, 16(r17)		C L1

	mulq	r9, r19, r2		C U1 #16
	beq	r6, $fix5		C U0
$ret5:	addq	r14, r4, r7		C L0
	ldq	r12, 24(r17)		C L1

	umulh	r9, r19, r3		C U1 #17
	beq	r7, $fix6		C U0
$ret6:	addq	r5, r8, r8		C L0 sum 2
	addq	r21, r22, r13		C L1 sum 2 mul's

	mulq	r10, r19, r4		C U1 #18
	addq	r23, r24, r22		C L0 sum 2 mul's
	cmpult	r13, r21, r14		C L1 carry from sum
	bgt	r18, $Loop		C U0
C ---------------------------------------------------------------
C Wind-down: finish the last 8-limb block without loading further limbs.
C The single-block feed-in path joins at $ret0c.
$Lend:
	cmpult	r8, r5, r29		C L0 carry from last bunch
	cmpult	r22, r24, r24		C U0 carry from sum

	umulh	r10, r19, r5		C U1 #02
	addq	r25, r26, r23		C U0 sum 2 mul's
	stq	r20, 0(r16)		C L0
	stq	r6, 8(r16)		C L1

	mulq	r11, r19, r6		C U1 #03
	cmpult	r23, r26, r25		C U0 carry from sum
	stq	r7, 16(r16)		C L0
	stq	r8, 24(r16)		C L1

	umulh	r11, r19, r7		C U1 #04
	addq	r27, r28, r28		C U0 sum 2 mul's

	mulq	r12, r19, r8		C U1 #05
	cmpult	r28, r27, r15		C L0 carry from sum
	lda	r16, 64(r16)		C L1 bookkeeping
	addq	r13, r29, r13		C U0 start carry cascade

	umulh	r12, r19, r21		C U1 #06
	beq	r13, $fix0c		C U0
$ret0c:	addq	r22, r14, r26		C L0
	beq	r26, $fix1c		C U0
$ret1c:	addq	r23, r24, r27		C L0
	beq	r27, $fix2c		C U0
$ret2c:	addq	r28, r25, r28		C L0
	beq	r28, $fix3c		C U0
$ret3c:	addq	r1, r2, r20		C L0 sum 2 mul's
	addq	r3, r4, r2		C L0 #10 2 mul's
	lda	r17, 64(r17)		C L1 bookkeeping
	cmpult	r20, r1, r29		C U0 carry from sum
	cmpult	r2, r4, r4		C U0 carry from sum
	stq	r13, -32(r16)		C L0
	stq	r26, -24(r16)		C L1
	addq	r5, r6, r14		C U0 sum 2 mul's
	stq	r27, -16(r16)		C L0
	stq	r28, -8(r16)		C L1
	cmpult	r14, r6, r3		C U0 carry from sum
	addq	r7, r3, r5		C L0 eat carry
	addq	r20, r15, r20		C U0 carry cascade
	beq	r20, $fix4c		C U0
$ret4c:	addq	r2, r29, r6		C L0
	beq	r6, $fix5c		C U0
$ret5c:	addq	r14, r4, r7		C L0
	beq	r7, $fix6c		C U0
$ret6c:	addq	r5, r8, r8		C L0 sum 2
	cmpult	r8, r5, r29		C L0 carry from last bunch
	stq	r20, 0(r16)		C L0
	stq	r6, 8(r16)		C L1
	stq	r7, 16(r16)		C L0
	stq	r8, 24(r16)		C L1
	addq	r29, r21, r0		C r0 = last high product + carry from last sum

C Restore the saved registers, release the frame and return.
	ldq	r26, 0(r30)
	ldq	r9, 8(r30)
	ldq	r10, 16(r30)
	ldq	r11, 24(r30)
	ldq	r12, 32(r30)
	ldq	r13, 40(r30)
	ldq	r14, 48(r30)
	ldq	r15, 56(r30)
	ldq	r29, 64(r30)
	lda	r30, 224(r30)
	ret	r31, (r26), 1

C ---------------------------------------------------------------
C Carry fix-up stubs, kept out of line so the common path never executes
C them.  Each is entered when (limb sum + carry-in) came out zero.  That is
C either "sum 0, carry-in 0" or "sum all ones, carry-in 1"; OR-ing the
C carry-in into the carry that feeds the next limb is right in both cases
C (in the first it ORs in zero).  Suffixes: w = feed-in code, none = loop,
C c = wind-down.  $fix4-$fix6 serve both the feed-in code and the loop.
C $fix0w:	bis	r14, r29, r14		C join carries
C	br	r31, $ret0w
$fix1w:	bis	r24, r14, r24		C join carries
	br	r31, $ret1w
$fix2w:	bis	r25, r24, r25		C join carries
	br	r31, $ret2w
$fix3w:	bis	r15, r25, r15		C join carries
	br	r31, $ret3w
$fix0:	bis	r14, r29, r14		C join carries
	br	r31, $ret0
$fix1:	bis	r24, r14, r24		C join carries
	br	r31, $ret1
$fix2:	bis	r25, r24, r25		C join carries
	br	r31, $ret2
$fix3:	bis	r15, r25, r15		C join carries
	br	r31, $ret3
$fix4:	bis	r29, r15, r29		C join carries
	br	r31, $ret4
$fix5:	bis	r4, r29, r4		C join carries
	br	r31, $ret5
C Seventh limb: its carry-in (r4) is added into r5, the high product that
C is summed into the eighth limb next.
$fix6:	addq	r5, r4, r5		C can't carry twice!
	br	r31, $ret6
$fix0c:	bis	r14, r29, r14		C join carries
	br	r31, $ret0c
$fix1c:	bis	r24, r14, r24		C join carries
	br	r31, $ret1c
$fix2c:	bis	r25, r24, r25		C join carries
	br	r31, $ret2c
$fix3c:	bis	r15, r25, r15		C join carries
	br	r31, $ret3c
$fix4c:	bis	r29, r15, r29		C join carries
	br	r31, $ret4c
$fix5c:	bis	r4, r29, r4		C join carries
	br	r31, $ret5c
$fix6c:	addq	r5, r4, r5		C can't carry twice!
	br	r31, $ret6c

EPILOGUE(mpn_mul_1)
ASM_END()