1dnl Alpha ev6 nails mpn_mul_1. 2 3dnl Copyright 2002, 2005, 2006 Free Software Foundation, Inc. 4dnl 5dnl This file is part of the GNU MP Library. 6dnl 7dnl The GNU MP Library is free software; you can redistribute it and/or 8dnl modify it under the terms of the GNU Lesser General Public License as 9dnl published by the Free Software Foundation; either version 3 of the 10dnl License, or (at your option) any later version. 11dnl 12dnl The GNU MP Library is distributed in the hope that it will be useful, 13dnl but WITHOUT ANY WARRANTY; without even the implied warranty of 14dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15dnl Lesser General Public License for more details. 16dnl 17dnl You should have received a copy of the GNU Lesser General Public License 18dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 19 20include(`../config.m4') 21 22C cycles/limb 23C EV4: 42 24C EV5: 18 25C EV6: 3.25 26 27C TODO 28C * Reroll loop for 3.0 c/l with current 4-way unrolling. 29C * The loop is overscheduled wrt loads and wrt multiplies, in particular 30C umulh. 31C * Use FP loop count and multiple exit points, that would simplify feed-in lp0 32C and would work since the loop structure is really regular. 

C  INPUT PARAMETERS
C  rp is only ever used as a store base, up only as a load base, and n is
C  the limb count that is examined modulo 4 and then counted down.  vl0 is
C  the multiplier limb.
define(`rp',`r16')
define(`up',`r17')
define(`n', `r18')
define(`vl0',`r19')

C  All-ones shifted right by NAIL_BITS; applied to every limb before it is
C  stored.
define(`numb_mask',`r6')

C  Product registers: mKa receives the mulq result and mKb the umulh result
C  for software pipeline stage K.
define(`m0a',`r0')
define(`m0b',`r1')
define(`m1a',`r2')
define(`m1b',`r3')
define(`m2a',`r20')
define(`m2b',`r21')
define(`m3a',`r22')
define(`m3b',`r23')

C  Two accumulators that alternate between consecutive limbs.
define(`acc0',`r25')
define(`acc1',`r27')

C  Source limbs.  ul0/ul2 share r4 and ul1/ul3 share r5, so a limb must be
C  fed to both mulq and umulh before the aliased name is loaded again.
define(`ul0',`r4')
define(`ul1',`r5')
define(`ul2',`r4')
define(`ul3',`r5')

C  NOTE(review): rl0..rl3 are defined but never referenced in the code below.
define(`rl0',`r24')
define(`rl1',`r24')
define(`rl2',`r24')
define(`rl3',`r24')

C  Scratch: t0 holds a shifted low product, t1 the carry out of an
C  accumulator.
define(`t0',`r7')
define(`t1',`r8')

define(`NAIL_BITS',`GMP_NAIL_BITS')
define(`NUMB_BITS',`GMP_NUMB_BITS')

dnl  This declaration is munged by configure
NAILS_SUPPORT(1-63)

C  mpn_mul_1 (rp, up, n, vl0), nails variant.
C
C  What the code does for each source limb ul, in increasing address order:
C    lo  = mulq (vl0 << NAIL_BITS, ul) >> NAIL_BITS
C    hi  = umulh (vl0 << NAIL_BITS, ul)
C    acc = lo + hi of the previous limb + (previous acc >> NUMB_BITS)
C    store acc & numb_mask
C  The value returned in r0 is the final acc >> NUMB_BITS plus the hi of the
C  last limb.
C  NOTE(review): reading lo/hi as the low NUMB_BITS and the remaining upper
C  bits of vl0*ul assumes NAIL_BITS + NUMB_BITS = 64 - confirm against the
C  GMP headers.
C
C  Structure: dispatch on n mod 4 to one of four feed-in sequences, each of
C  which either finishes in the shared wind-down code (L(ta2)..L(ta6)) or
C  jumps into the 4-way unrolled loop at the matching L(elK) entry point.
C  The feed-ins bias rp so that the fixed store offsets used by the loop and
C  the wind-down code (-24 .. 24) land on the right result limbs.
C
C  n = 0 is not special-cased: it would take the L(0m4) path and process four
C  limbs.  Presumably callers guarantee n >= 1 - confirm against the mpn
C  interface documentation.
C
C  No stack frame is used.  Registers written: r0-r8, r16-r23, r25, r27, r28.
ASM_START()
PROLOGUE(mpn_mul_1)
	sll	vl0, NAIL_BITS, vl0		C pre-shift the multiplier once
	lda	numb_mask, -1(r31)		C r31 reads as zero, so this is -1
	srl	numb_mask, NAIL_BITS, numb_mask

C  Dispatch on n mod 4.  r25 and r21 are free as scratch here; they only
C  take on their acc0 and m2b roles further down.
	and	n, 3, r25
	cmpeq	r25, 1, r21
	bne	r21, L(1m4)
	cmpeq	r25, 2, r21
	bne	r21, L(2m4)
	beq	r25, L(0m4)

C  n mod 4 = 3: start three limbs, bias rp by -8.
L(3m4):	ldq	ul3, 0(up)
	lda	n, -4(n)
	ldq	ul0, 8(up)
	mulq	vl0, ul3, m3a
	umulh	vl0, ul3, m3b
	ldq	ul1, 16(up)
	lda	up, 24(up)
	lda	rp, -8(rp)
	mulq	vl0, ul0, m0a
	umulh	vl0, ul0, m0b
	bge	n, L(ge3)			C taken when n was 7 or more

C  n = 3 exactly: finish in the three-limb wind-down.
	mulq	vl0, ul1, m1a
	umulh	vl0, ul1, m1b
	srl	m3a,NAIL_BITS, t0
	addq	t0, r31, acc1			C first limb has no incoming high part
	srl	m0a,NAIL_BITS, t0
	addq	t0, m3b, acc0
	srl	acc1,NUMB_BITS, t1
	br	r31, L(ta3)

C  n >= 7: start three more limbs and enter the loop at L(el3).
L(ge3):	ldq	ul2, 0(up)
	mulq	vl0, ul1, m1a
	umulh	vl0, ul1, m1b
	srl	m3a,NAIL_BITS, t0
	ldq	ul3, 8(up)
	lda	n, -4(n)
	mulq	vl0, ul2, m2a
	addq	t0, r31, acc1
	umulh	vl0, ul2, m2b
	srl	m0a,NAIL_BITS, t0
	ldq	ul0, 16(up)
	mulq	vl0, ul3, m3a
	addq	t0, m3b, acc0
	srl	acc1,NUMB_BITS, t1
	br	r31, L(el3)

C  n mod 4 = 0: start four limbs, rp is left unbiased.  n is reduced by 8
C  here so that n = 4 falls through to the wind-down code and only n >= 8
C  continues at L(ge4).
L(0m4):	lda	n, -8(n)
	ldq	ul2, 0(up)
	ldq	ul3, 8(up)
	mulq	vl0, ul2, m2a
	umulh	vl0, ul2, m2b
	ldq	ul0, 16(up)
	mulq	vl0, ul3, m3a
	umulh	vl0, ul3, m3b
	ldq	ul1, 24(up)
	lda	up, 32(up)
	mulq	vl0, ul0, m0a
	umulh	vl0, ul0, m0b
	bge	n, L(ge4)

C  n = 4 exactly: finish in the four-limb wind-down.
	srl	m2a,NAIL_BITS, t0
	mulq	vl0, ul1, m1a
	addq	t0, r31, acc0
	umulh	vl0, ul1, m1b
	srl	m3a,NAIL_BITS, t0
	addq	t0, m2b, acc1
	srl	acc0,NUMB_BITS, t1
	br	r31, L(ta4)

C  n >= 8: start two more limbs and enter the loop at L(el0).
L(ge4):	srl	m2a,NAIL_BITS, t0
	ldq	ul2, 0(up)
	mulq	vl0, ul1, m1a
	addq	t0, r31, acc0
	umulh	vl0, ul1, m1b
	srl	m3a,NAIL_BITS, t0
	ldq	ul3, 8(up)
	lda	n, -4(n)
	mulq	vl0, ul2, m2a
	addq	t0, m2b, acc1
	srl	acc0,NUMB_BITS, t1
	br	r31, L(el0)

C  n mod 4 = 2: start two limbs, bias rp by -16.
L(2m4):	lda	n, -4(n)
	ldq	ul0, 0(up)
	ldq	ul1, 8(up)
	lda	up, 16(up)
	lda	rp, -16(rp)
	mulq	vl0, ul0, m0a
	umulh	vl0, ul0, m0b
	bge	n, L(ge2)			C taken when n was 6 or more

C  n = 2 exactly: finish in the two-limb wind-down.
	mulq	vl0, ul1, m1a
	umulh	vl0, ul1, m1b
	srl	m0a,NAIL_BITS, t0
	addq	t0, r31, acc0
	srl	m1a,NAIL_BITS, t0
	addq	t0, m0b, acc1
	srl	acc0,NUMB_BITS, t1
	br	r31, L(ta2)

C  n >= 6: start four more limbs, advance rp by 32 to the loop bias, then
C  either enter the loop at L(el2) or, for n = 6, go to the six-limb
C  wind-down.
L(ge2):	ldq	ul2, 0(up)
	mulq	vl0, ul1, m1a
	umulh	vl0, ul1, m1b
	ldq	ul3, 8(up)
	lda	n, -4(n)
	mulq	vl0, ul2, m2a
	umulh	vl0, ul2, m2b
	srl	m0a,NAIL_BITS, t0
	ldq	ul0, 16(up)
	mulq	vl0, ul3, m3a
	addq	t0, r31, acc0
	umulh	vl0, ul3, m3b
	srl	m1a,NAIL_BITS, t0
	ldq	ul1, 24(up)
	lda	up, 32(up)
	lda	rp, 32(rp)
	mulq	vl0, ul0, m0a
	addq	t0, m0b, acc1
	srl	acc0,NUMB_BITS, t1
	bge	n, L(el2)

	br	r31, L(ta6)

C  n mod 4 = 1: start one limb, bias rp by -24.
L(1m4):	lda	n, -4(n)
	ldq	ul1, 0(up)
	lda	up, 8(up)
	lda	rp, -24(rp)
	bge	n, L(ge1)			C taken when n was 5 or more

C  n = 1 exactly: complete the single limb here and return.
	mulq	vl0, ul1, m1a
	umulh	vl0, ul1, m1b
	srl	m1a,NAIL_BITS, t0
	addq	t0, r31, acc1
	and	acc1,numb_mask, r28
	srl	acc1,NUMB_BITS, t1
	stq	r28, 24(rp)			C rp was biased by -24: first result limb
	addq	t1, m1b, r0			C return value
	ret	r31, (r26), 1

C  n >= 5: start four more limbs, advance rp by 32 to the loop bias, then
C  either go to the five-limb wind-down (n = 5) or enter the loop at L(el1).
L(ge1):	ldq	ul2, 0(up)
	mulq	vl0, ul1, m1a
	umulh	vl0, ul1, m1b
	ldq	ul3, 8(up)
	lda	n, -4(n)
	mulq	vl0, ul2, m2a
	umulh	vl0, ul2, m2b
	ldq	ul0, 16(up)
	mulq	vl0, ul3, m3a
	umulh	vl0, ul3, m3b
	srl	m1a,NAIL_BITS, t0
	ldq	ul1, 24(up)
	lda	up, 32(up)
	lda	rp, 32(rp)
	mulq	vl0, ul0, m0a
	addq	t0, r31, acc1
	umulh	vl0, ul0, m0b
	srl	m2a,NAIL_BITS, t0
	mulq	vl0, ul1, m1a
	addq	t0, m1b, acc0
	srl	acc1,NUMB_BITS, t1
	blt	n, L(ta5)

L(ge5):	ldq	ul2, 0(up)
	br	r31, L(el1)

C  Main loop, four limbs per iteration.  Each pass loads four source limbs
C  (offsets 0..24 from up), stores four result limbs (offsets -24..0 from rp),
C  decrements n by 4 and advances up and rp by 32 bytes.  L(el0)..L(el3) are
C  the entry points used by the feed-in code above.
C  The U0/U1/L0/L1 tags are presumably the intended EV6 issue slots - confirm
C  against the 21264 documentation.
	ALIGN(16)
L(top):	mulq	vl0, ul0, m0a		C U1
	addq	t0, m0b, acc1		C L0
	srl	acc0,NUMB_BITS, t1	C U0
	stq	r28, -24(rp)		C L1
C
L(el2):	umulh	vl0, ul0, m0b		C U1
	and	acc0,numb_mask, r28	C L0
	unop				C U0
	unop				C L1
C
	unop				C U1
	addq	t1, acc1, acc1		C L0
	srl	m2a,NAIL_BITS, t0	C U0
	ldq	ul2, 0(up)		C L1
C
	mulq	vl0, ul1, m1a		C U1
	addq	t0, m1b, acc0		C L0
	srl	acc1,NUMB_BITS, t1	C U0
	stq	r28, -16(rp)		C L1
C
L(el1):	umulh	vl0, ul1, m1b		C U1
	and	acc1,numb_mask, r28	C L0
	unop				C U0
	lda	n, -4(n)		C L1
C
	unop				C U1
	addq	t1, acc0, acc0		C L0
	srl	m3a,NAIL_BITS, t0	C U0
	ldq	ul3, 8(up)		C L1
C
	mulq	vl0, ul2, m2a		C U1
	addq	t0, m2b, acc1		C L0
	srl	acc0,NUMB_BITS, t1	C U0
	stq	r28, -8(rp)		C L1
C
L(el0):	umulh	vl0, ul2, m2b		C U1
	and	acc0,numb_mask, r28	C L0
	unop				C U0
	unop				C L1
C
	unop				C U1
	addq	t1, acc1, acc1		C L0
	srl	m0a,NAIL_BITS, t0	C U0
	ldq	ul0, 16(up)		C L1
C
	mulq	vl0, ul3, m3a		C U1
	addq	t0, m3b, acc0		C L0
	srl	acc1,NUMB_BITS, t1	C U0
	stq	r28, 0(rp)		C L1
C
L(el3):	umulh	vl0, ul3, m3b		C U1
	and	acc1,numb_mask, r28	C L0
	unop				C U0
	unop				C L1
C
	unop				C U1
	addq	t1, acc0, acc0		C L0
	srl	m1a,NAIL_BITS, t0	C U0
	ldq	ul1, 24(up)		C L1
C
	lda	up, 32(up)		C L0
	unop				C U1
	lda	rp, 32(rp)		C L1
	bge	n, L(top)		C U0

C  Wind-down.  This is the loop body with the loads removed; execution falls
C  straight through from L(end) to the ret.  Entering at L(taK) leaves exactly
C  K result limbs still to be stored (offsets 32-8*K .. 24 from rp); L(end)
C  itself stores seven.
L(end):	mulq	vl0, ul0, m0a
	addq	t0, m0b, acc1
	srl	acc0,NUMB_BITS, t1
	stq	r28, -24(rp)
L(ta6):	umulh	vl0, ul0, m0b
	and	acc0,numb_mask, r28
	addq	t1, acc1, acc1
	srl	m2a,NAIL_BITS, t0
	mulq	vl0, ul1, m1a
	addq	t0, m1b, acc0
	srl	acc1,NUMB_BITS, t1
	stq	r28, -16(rp)
L(ta5):	umulh	vl0, ul1, m1b
	and	acc1,numb_mask, r28
	addq	t1, acc0, acc0
	srl	m3a,NAIL_BITS, t0
	addq	t0, m2b, acc1
	srl	acc0,NUMB_BITS, t1
	stq	r28, -8(rp)
C  NOTE(review): the ALIGN(16) directives below sit on a fall-through path,
C  so whatever padding ALIGN emits is executed - presumably no-ops; confirm
C  in the ALIGN macro definition.
	ALIGN(16)
L(ta4):	and	acc0,numb_mask, r28
	addq	t1, acc1, acc1
	srl	m0a,NAIL_BITS, t0
	addq	t0, m3b, acc0
	srl	acc1,NUMB_BITS, t1
	stq	r28, 0(rp)
	unop
	ALIGN(16)
L(ta3):	and	acc1,numb_mask, r28
	addq	t1, acc0, acc0
	srl	m1a,NAIL_BITS, t0
	addq	t0, m0b, acc1
	srl	acc0,NUMB_BITS, t1
	stq	r28, 8(rp)
	unop
	ALIGN(16)
L(ta2):	and	acc0,numb_mask, r28
	addq	t1, acc1, acc1
	srl	acc1,NUMB_BITS, t1
	stq	r28, 16(rp)
	and	acc1,numb_mask, r28
	addq	t1, m1b, r0			C return value: last carry plus last high product
	stq	r28, 24(rp)
	ret	r31, (r26), 1
EPILOGUE()
ASM_END()