dnl Alpha ev6 nails mpn_submul_1.

dnl Copyright 2002, 2005, 2006 Free Software Foundation, Inc.
dnl
dnl This file is part of the GNU MP Library.
dnl
dnl The GNU MP Library is free software; you can redistribute it and/or
dnl modify it under the terms of the GNU Lesser General Public License as
dnl published by the Free Software Foundation; either version 3 of the
dnl License, or (at your option) any later version.
dnl
dnl The GNU MP Library is distributed in the hope that it will be useful,
dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
dnl Lesser General Public License for more details.
dnl
dnl You should have received a copy of the GNU Lesser General Public License
dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C      cycles/limb
C EV4:    42
C EV5:    18
C EV6:     4

C TODO
C  * Reroll loop for 3.75 c/l with current 4-way unrolling.
C  * The loop is overscheduled wrt loads and wrt multiplies, in particular
C    umulh.
C  * Use FP loop count and multiple exit points, that would simplify feed-in lp0
C    and would work since the loop structure is really regular.

C OVERVIEW
C  The routine walks n limbs in ascending address order.  For every limb it
C  loads one word from up and one from rp, multiplies the up word by vl0
C  (mulq gives the low 64 bits, umulh the high 64 bits), subtracts the product
C  together with the carry of the previous limb from the rp word, masks the
C  result with numb_mask and stores it back to rp.
C
C  Per limb i the arithmetic is
C      acc = rl_i - ((lo_i >> NAIL_BITS) + hi_(i-1)) + t1
C  where lo_i / hi_i are the mulq / umulh results for limb i and t1 is the
C  previous acc shifted right arithmetically by NUMB_BITS (the signed borrow).
C  The first limb uses zero (r31) in place of hi_(i-1) and has no t1 term.
C
C  The value left in r0 on return is the umulh result of the last limb minus
C  the final t1.
C
C  Structure: dispatch on n mod 4 into one of four feed-in sequences, each of
C  which either jumps into the 4-way unrolled loop at L(el0)..L(el3) or, for
C  small n, straight into the wind-down code at L(ta2)..L(ta6).
C
C  NOTE(review): no test for n = 0 is visible here; presumably callers
C  guarantee n >= 1 -- confirm against the mpn interface.

C  INPUT PARAMETERS
define(`rp',`r16')
define(`up',`r17')
define(`n', `r18')
define(`vl0',`r19')

C All ones shifted right by NAIL_BITS, set up at function entry.
define(`numb_mask',`r6')

C Product registers: mNa receives the mulq (low) result and mNb the umulh
C (high) result for unrolled position N.  Note that m0a is r0, which is also
C written with the return value at the very end.
define(`m0a',`r0')
define(`m0b',`r1')
define(`m1a',`r2')
define(`m1b',`r3')
define(`m2a',`r20')
define(`m2b',`r21')
define(`m3a',`r22')
define(`m3b',`r23')

C Two alternating accumulators for the per-limb subtraction.
define(`acc0',`r25')
define(`acc1',`r27')

C Limbs loaded from up.  Only two physical registers are used: ul0/ul2 share
C r4 and ul1/ul3 share r5, so a load into one name overwrites the other.
define(`ul0',`r4')
define(`ul1',`r5')
define(`ul2',`r4')
define(`ul3',`r5')

C Limbs loaded from rp.  All four names are the same physical register, so
C each loaded value must be consumed before the next rl load.
define(`rl0',`r24')
define(`rl1',`r24')
define(`rl2',`r24')
define(`rl3',`r24')

C t0: low product part after the right shift.  t1: borrow from the previous
C accumulator.
define(`t0',`r7')
define(`t1',`r8')

define(`NAIL_BITS',`GMP_NAIL_BITS')
define(`NUMB_BITS',`GMP_NUMB_BITS')

dnl This declaration is munged by configure
NAILS_SUPPORT(2-63)

ASM_START()
PROLOGUE(mpn_submul_1)
C Pre-shift the multiplier left by NAIL_BITS.  The code below then uses the
C umulh result directly as the high part of each product, and the mulq result
C shifted right by NAIL_BITS as the low part.
        sll     vl0, NAIL_BITS, vl0
C numb_mask = -1 >> NAIL_BITS (logical shift; r31 reads as zero on Alpha).
        lda     numb_mask, -1(r31)
        srl     numb_mask, NAIL_BITS, numb_mask

C Dispatch on n mod 4.  r25 and r21 serve as scratch here; they are the
C registers behind acc0 and m2b, neither of which holds a value yet.
        and     n, 3, r25
        cmpeq   r25, 1, r21
        bne     r21, L(1m4)
        cmpeq   r25, 2, r21
        bne     r21, L(2m4)
        beq     r25, L(0m4)

C n mod 4 = 3 (fall-through case).  Three limbs are loaded, up advances by 24
C bytes and rp is biased by -8 bytes; the rp offsets used below and in the
C shared loop and tail code are relative to that biased pointer.
L(3m4): ldq     ul3, 0(up)
        lda     n, -4(n)
        ldq     ul0, 8(up)
        mulq    vl0, ul3, m3a
        umulh   vl0, ul3, m3b
        ldq     ul1, 16(up)
        lda     up, 24(up)
        lda     rp, -8(rp)
        mulq    vl0, ul0, m0a
        umulh   vl0, ul0, m0b
C n - 4 >= 0 means more than three limbs: go on to the loop feed-in.
        bge     n, L(ge3)

C Exactly three limbs: finish the first limb here, then use the tail.
        mulq    vl0, ul1, m1a
        umulh   vl0, ul1, m1b
        ldq     rl3, 8(rp)
        srl     m3a,NAIL_BITS, t0
C First limb: there is no previous high part, so add zero (acts as a move).
        addq    t0, r31, acc1
        subq    rl3, acc1, acc1
        ldq     rl0, 16(rp)
        srl     m0a,NAIL_BITS, t0
        addq    t0, m3b, acc0
        sra     acc1,NUMB_BITS, t1
        br      r31, L(ta3)

C Seven or more limbs with n mod 4 = 3: prime the pipeline and enter the loop
C at L(el3).
L(ge3): ldq     ul2, 0(up)
        mulq    vl0, ul1, m1a
        umulh   vl0, ul1, m1b
        ldq     rl3, 8(rp)
        srl     m3a,NAIL_BITS, t0
        ldq     ul3, 8(up)
        lda     n, -4(n)
        mulq    vl0, ul2, m2a
        addq    t0, r31, acc1
        umulh   vl0, ul2, m2b
        subq    rl3, acc1, acc1
        ldq     rl0, 16(rp)
        srl     m0a,NAIL_BITS, t0
        ldq     ul0, 16(up)
        mulq    vl0, ul3, m3a
        addq    t0, m3b, acc0
        sra     acc1,NUMB_BITS, t1
        br      r31, L(el3)

C n mod 4 = 0.  Four limbs are loaded and up advances by 32 bytes; rp is not
C biased.  n is reduced by 8 here (not 4), so the bge below is taken only when
C at least eight limbs were requested.
L(0m4): lda     n, -8(n)
        ldq     ul2, 0(up)
        ldq     ul3, 8(up)
        mulq    vl0, ul2, m2a
        umulh   vl0, ul2, m2b
        ldq     ul0, 16(up)
        mulq    vl0, ul3, m3a
        umulh   vl0, ul3, m3b
        ldq     ul1, 24(up)
        lda     up, 32(up)
        mulq    vl0, ul0, m0a
        umulh   vl0, ul0, m0b
        bge     n, L(ge4)

C Four limbs only: finish the first limb here, then use the tail.
        ldq     rl2, 0(rp)
        srl     m2a,NAIL_BITS, t0
        mulq    vl0, ul1, m1a
        addq    t0, r31, acc0
        umulh   vl0, ul1, m1b
        subq    rl2, acc0, acc0
        ldq     rl3, 8(rp)
        srl     m3a,NAIL_BITS, t0
        addq    t0, m2b, acc1
        sra     acc0,NUMB_BITS, t1
        br      r31, L(ta4)

C Eight or more limbs with n mod 4 = 0: prime the pipeline and enter the loop
C at L(el0).
L(ge4): ldq     rl2, 0(rp)
        srl     m2a,NAIL_BITS, t0
        ldq     ul2, 0(up)
        mulq    vl0, ul1, m1a
        addq    t0, r31, acc0
        umulh   vl0, ul1, m1b
        subq    rl2, acc0, acc0
        ldq     rl3, 8(rp)
        srl     m3a,NAIL_BITS, t0
        ldq     ul3, 8(up)
        lda     n, -4(n)
        mulq    vl0, ul2, m2a
        addq    t0, m2b, acc1
        sra     acc0,NUMB_BITS, t1
        br      r31, L(el0)

C n mod 4 = 2.  Two limbs are loaded, up advances by 16 bytes and rp is biased
C by -16 bytes.
L(2m4): lda     n, -4(n)
        ldq     ul0, 0(up)
        ldq     ul1, 8(up)
        lda     up, 16(up)
        lda     rp, -16(rp)
        mulq    vl0, ul0, m0a
        umulh   vl0, ul0, m0b
        bge     n, L(ge2)

C Two limbs only: finish the first limb here, then use the tail.
        mulq    vl0, ul1, m1a
        umulh   vl0, ul1, m1b
        ldq     rl0, 16(rp)
        srl     m0a,NAIL_BITS, t0
        addq    t0, r31, acc0
        subq    rl0, acc0, acc0
        ldq     rl1, 24(rp)
        srl     m1a,NAIL_BITS, t0
        addq    t0, m0b, acc1
        sra     acc0,NUMB_BITS, t1
        br      r31, L(ta2)

C Six or more limbs with n mod 4 = 2.  After the second n adjustment a
C non-negative n selects the loop entry L(el2); otherwise the remaining work
C is done by the tail starting at L(ta6).
L(ge2): ldq     ul2, 0(up)
        mulq    vl0, ul1, m1a
        umulh   vl0, ul1, m1b
        ldq     ul3, 8(up)
        lda     n, -4(n)
        mulq    vl0, ul2, m2a
        umulh   vl0, ul2, m2b
        ldq     rl0, 16(rp)
        srl     m0a,NAIL_BITS, t0
        ldq     ul0, 16(up)
        mulq    vl0, ul3, m3a
        addq    t0, r31, acc0
        umulh   vl0, ul3, m3b
        subq    rl0, acc0, acc0
        ldq     rl1, 24(rp)
        srl     m1a,NAIL_BITS, t0
        ldq     ul1, 24(up)
        lda     up, 32(up)
        lda     rp, 32(rp)
        mulq    vl0, ul0, m0a
        addq    t0, m0b, acc1
        sra     acc0,NUMB_BITS, t1
        bge     n, L(el2)

        br      r31, L(ta6)

C n mod 4 = 1.  One limb is loaded, up advances by 8 bytes and rp is biased by
C -24 bytes.
L(1m4): lda     n, -4(n)
        ldq     ul1, 0(up)
        lda     up, 8(up)
        lda     rp, -24(rp)
        bge     n, L(ge1)

C Single limb: complete the whole operation inline and return.
        mulq    vl0, ul1, m1a
        umulh   vl0, ul1, m1b
        ldq     rl1, 24(rp)
        srl     m1a,NAIL_BITS, t0
        subq    rl1, t0, acc1
        and     acc1,numb_mask, r28
        sra     acc1,NUMB_BITS, t1
        stq     r28, 24(rp)
C Return value: high product word minus the (signed) borrow.
        subq    m1b, t1, r0
        ret     r31, (r26), 1

C Five or more limbs with n mod 4 = 1.  After the second n adjustment a
C negative n selects the tail at L(ta5); otherwise execution falls into
C L(ge5), which enters the loop at L(el1).
L(ge1): ldq     ul2, 0(up)
        mulq    vl0, ul1, m1a
        umulh   vl0, ul1, m1b
        ldq     ul3, 8(up)
        lda     n, -4(n)
        mulq    vl0, ul2, m2a
        umulh   vl0, ul2, m2b
        ldq     ul0, 16(up)
        mulq    vl0, ul3, m3a
        umulh   vl0, ul3, m3b
        ldq     rl1, 24(rp)
        srl     m1a,NAIL_BITS, t0
        ldq     ul1, 24(up)
        lda     up, 32(up)
        lda     rp, 32(rp)
        mulq    vl0, ul0, m0a
        addq    t0, r31, acc1
        umulh   vl0, ul0, m0b
        subq    rl1, acc1, acc1
        ldq     rl2, 0(rp)
        srl     m2a,NAIL_BITS, t0
        mulq    vl0, ul1, m1a
        addq    t0, m1b, acc0
        sra     acc1,NUMB_BITS, t1
        blt     n, L(ta5)

L(ge5): ldq     ul2, 0(up)
        br      r31, L(el1)

C Main loop, four limbs per iteration.  Each group of four instructions
C overlaps the multiply for a later limb with the subtract, mask and store of
C earlier ones.  n drops by 4 once per iteration, up and rp advance by 32
C bytes at the bottom, and the loop repeats while n >= 0.
C The trailing U1/L0/U0/L1 tags are the original scheduling annotations;
C presumably the EV6 issue slot intended for each instruction -- confirm
C against the 21264 documentation.  The unop instructions appear to be
C padding that keeps each group at four instructions.
        ALIGN(16)
L(top): mulq    vl0, ul0, m0a           C U1
        addq    t0, m0b, acc1           C L0
        sra     acc0,NUMB_BITS, t1      C U0
        stq     r28, -24(rp)            C L1
C
L(el2): umulh   vl0, ul0, m0b           C U1
        and     acc0,numb_mask, r28     C L0
        subq    rl1, acc1, acc1         C U0
        ldq     rl2, 0(rp)              C L1
C
        unop                            C U1
        addq    t1, acc1, acc1          C L0
        srl     m2a,NAIL_BITS, t0       C U0
        ldq     ul2, 0(up)              C L1
C
        mulq    vl0, ul1, m1a           C U1
        addq    t0, m1b, acc0           C L0
        sra     acc1,NUMB_BITS, t1      C U0
        stq     r28, -16(rp)            C L1
C
L(el1): umulh   vl0, ul1, m1b           C U1
        and     acc1,numb_mask, r28     C L0
        subq    rl2, acc0, acc0         C U0
        ldq     rl3, 8(rp)              C L1
C
        lda     n, -4(n)                C L1
        addq    t1, acc0, acc0          C L0
        srl     m3a,NAIL_BITS, t0       C U0
        ldq     ul3, 8(up)              C L1
C
        mulq    vl0, ul2, m2a           C U1
        addq    t0, m2b, acc1           C L0
        sra     acc0,NUMB_BITS, t1      C U0
        stq     r28, -8(rp)             C L1
C
L(el0): umulh   vl0, ul2, m2b           C U1
        and     acc0,numb_mask, r28     C L0
        subq    rl3, acc1, acc1         C U0
        ldq     rl0, 16(rp)             C L1
C
        unop                            C U1
        addq    t1, acc1, acc1          C L0
        srl     m0a,NAIL_BITS, t0       C U0
        ldq     ul0, 16(up)             C L1
C
        mulq    vl0, ul3, m3a           C U1
        addq    t0, m3b, acc0           C L0
        sra     acc1,NUMB_BITS, t1      C U0
        stq     r28, 0(rp)              C L1
C
L(el3): umulh   vl0, ul3, m3b           C U1
        and     acc1,numb_mask, r28     C L0
        subq    rl0, acc0, acc0         C U0
        ldq     rl1, 24(rp)             C L1
C
        unop                            C U1
        addq    t1, acc0, acc0          C L0
        srl     m1a,NAIL_BITS, t0       C U0
        ldq     ul1, 24(up)             C L1
C
        lda     up, 32(up)              C L0
        unop                            C U1
        lda     rp, 32(rp)              C L1
        bge     n, L(top)               C U0

C Wind-down.  The same per-limb steps as in the loop, but with no further
C loads from up: the limbs already in ul0 and ul1 are multiplied and the
C pending accumulators are drained.  L(ta6) down to L(ta2) are also entered
C directly from the feed-in code when n is too small for the loop.
L(end): mulq    vl0, ul0, m0a
        addq    t0, m0b, acc1
        sra     acc0,NUMB_BITS, t1
        stq     r28, -24(rp)
L(ta6): umulh   vl0, ul0, m0b
        and     acc0,numb_mask, r28
        subq    rl1, acc1, acc1
        ldq     rl2, 0(rp)
        addq    t1, acc1, acc1
        srl     m2a,NAIL_BITS, t0
        mulq    vl0, ul1, m1a
        addq    t0, m1b, acc0
        sra     acc1,NUMB_BITS, t1
        stq     r28, -16(rp)
L(ta5): umulh   vl0, ul1, m1b
        and     acc1,numb_mask, r28
        subq    rl2, acc0, acc0
        ldq     rl3, 8(rp)
        addq    t1, acc0, acc0
        srl     m3a,NAIL_BITS, t0
        addq    t0, m2b, acc1
        sra     acc0,NUMB_BITS, t1
        stq     r28, -8(rp)
        unop
        ALIGN(16)
L(ta4): and     acc0,numb_mask, r28
        subq    rl3, acc1, acc1
        ldq     rl0, 16(rp)
        addq    t1, acc1, acc1
        srl     m0a,NAIL_BITS, t0
        addq    t0, m3b, acc0
        sra     acc1,NUMB_BITS, t1
        stq     r28, 0(rp)
        unop
        ALIGN(16)
L(ta3): and     acc1,numb_mask, r28
        subq    rl0, acc0, acc0
        ldq     rl1, 24(rp)
        addq    t1, acc0, acc0
        srl     m1a,NAIL_BITS, t0
        addq    t0, m0b, acc1
        sra     acc0,NUMB_BITS, t1
        stq     r28, 8(rp)
        unop
        ALIGN(16)
C Last two stores.  The final t1 is the borrow out of the last limb.
L(ta2): and     acc0,numb_mask, r28
        subq    rl1, acc1, acc1
        addq    t1, acc1, acc1
        sra     acc1,NUMB_BITS, t1
        stq     r28, 16(rp)
        and     acc1,numb_mask, r28
C Return value: high product word of the last limb minus the (signed) borrow.
        subq    m1b, t1, r0
        stq     r28, 24(rp)
C Return through the address held in r26.
        ret     r31, (r26), 1
EPILOGUE()
ASM_END()