dnl  PowerPC-64 mpn_divrem_1 -- Divide an mpn number by an unnormalized limb.

dnl  Copyright 2003, 2004, 2005, 2007, 2008 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C                      cycles/limb
C                   norm    unorm   frac
C POWER3/PPC630     16-34   16-34   ~11
C POWER4/PPC970     29 19   (only two figures recorded; column assignment unconfirmed)
C POWER5            29      29      ~20

C INPUT PARAMETERS
C qp = r3
C fn = r4
C up = r5
C un = r6
C d  = r7

C We use a not very predictable branch in the frac code, therefore the cycle
C count wobbles somewhat.  With the alternative branch-free code, things run
C considerably slower on POWER4/PPC970 and POWER5.

C Add preinv entry point.

C OVERVIEW
C The routine writes un + fn quotient limbs, most significant first, walking
C downwards from qp + 8*(un+fn) - 8, and returns the remainder in r3.  The un
C integer limbs are read from up; the fn fraction limbs are produced without
C reading any further input.  If un + fn is zero it returns 0 immediately.
C
C Two code paths exist, selected by the sign bit of d:
C   * d has its top bit set:   L(nrm) path, d is used as it is.
C   * d has its top bit clear: d and the dividend are shifted left by
C     cnt = cntlzd(d), and the final remainder is shifted right by cnt.
C
C REGISTER ROLES (after the entry sequence)
C   r25  fn
C   r26  up
C   r27  cnt, the left shift applied to d (unnormalized path only)
C   r28  number of integer limbs still to be divided
C   r29  address of the next quotient limb to store
C   r30  d (shifted left by cnt on the unnormalized path)
C   r31  running remainder
C   r3   inside the loops: the value returned by mpn_invert_limb
C   cr4  tested after the call to mpn_invert_limb, hence saved and restored


ASM_START()

EXTERN_FUNC(mpn_invert_limb)

PROLOGUE(mpn_divrem_1)

C Entry: spill r25-r31 below the incoming stack pointer, stash LR and CR in
C the slots at 16(r1) and 8(r1), then allocate a 176 byte frame.  Argument
C copies are interleaved with the stores.
	mfcr	r12
	add.	r10, r6, r4		C r10 = un + fn, sets cr0
	std	r25, -56(r1)
	mr	r25, r4			C r25 = fn
	mflr	r0
	std	r26, -48(r1)
	mr	r26, r5			C r26 = up
	std	r28, -32(r1)
	mr	r28, r6			C r28 = un
	std	r29, -24(r1)
	mr	r29, r3			C r29 = qp
	li	r3, 0			C return value if nothing to do
	std	r30, -16(r1)
	mr	r30, r7			C r30 = d
	std	r31, -8(r1)
	li	r31, 0			C remainder starts at 0
	std	r27, -40(r1)
	std	r0, 16(r1)
	stw	r12, 8(r1)
	stdu	r1, -176(r1)
	beq-	cr0, L(ret)		C un + fn == 0
	cmpdi	cr7, r7, 0		C signed test: negative means top bit set
	sldi	r0, r10, 3
	add	r11, r0, r29		C r11 = qp + 8*(un+fn)
	addi	r29, r11, -8		C r29 -> most significant quotient limb
	blt-	cr7, L(nrm)

C Unnormalized divisor.  If there are integer limbs and the top one is
C smaller than d, the top quotient limb is zero and that limb becomes the
C initial remainder.
	cmpdi	cr4, r6, 0
	beq+	cr4, L(shft)		C un == 0
	sldi	r9, r6, 3
	add	r9, r9, r5
	ld	r7, -8(r9)		C r7 = up[un-1]
	cmpld	cr7, r7, r30
	bge-	cr7, L(shft)		C top limb >= d, no shortcut
	cmpdi	cr7, r10, 1
	li	r0, 0
	mr	r31, r7			C remainder = top limb
	std	r0, -8(r11)		C top quotient limb = 0
	addi	r29, r29, -8
	mr	r3, r7			C return value if that was the only limb
	beq-	cr7, L(ret)		C un + fn == 1
	addi	r28, r6, -1
	cmpdi	cr4, r28, 0		C cr4.eq = no integer limbs left
L(shft):
	cntlzd	r27, r30		C r27 = cnt
	sld	r30, r30, r27		C d <<= cnt
	sld	r31, r31, r27		C remainder <<= cnt
	mr	r3, r30
	CALL(	mpn_invert_limb)
	nop				C slot after the call (presumably for TOC restore; confirm)
	beq-	cr4, L(ufrc)		C no integer limbs, go to fraction part
	sldi	r9, r28, 3
	addic.	r6, r28, -2		C cr0.lt if only one integer limb remains
	add	r9, r9, r26
	subfic	r5, r27, 64		C r5 = 64 - cnt
	ld	r8, -8(r9)		C r8 = highest remaining source limb
	srd	r0, r8, r5
	or	r31, r31, r0		C bring its top cnt bits into the remainder
	sld	r7, r8, r27		C r7 = its low bits, moved up by cnt
	blt-	cr0, L(ulst)
	addi	r28, r28, -1
	mtctr	r28			C loop count = remaining limbs - 1
	sldi	r6, r6, 3		C r6 = byte offset of the next source limb
	ALIGN(16)
C Integer limbs, unnormalized divisor.  Each pass forms the shifted dividend
C limb nl in r9 from two adjacent source limbs and produces one quotient limb.
L(utop):
	addi	r11, r31, 1		C r + 1
	ldx	r8, r26, r6		C next lower source limb
	mulld	r0, r31, r3		C low  half of r * inverse
	mulhdu	r10, r31, r3		C high half of r * inverse
	addi	r6, r6, -8
	srd	r9, r8, r5
	or	r9, r7, r9		C nl
	addc	r0, r0, r9		C ql = low + nl
	adde	r10, r10, r11		C qh = high + r + 1 + carry
	mulld	r31, r10, r30
	subf	r31, r31, r9		C r = nl - qh * d
	subfc	r0, r0, r31		C carry = (r >= ql)
	subfe	r0, r0, r0		C r0 = (r >= ql) - 1
	not	r7, r0			C r7 = -(r >= ql)
	add	r10, r7, r10		C qh -= (r >= ql)
	andc	r0, r30, r0		C d if r >= ql, else 0
	add	r31, r31, r0		C r += that
	cmpld	cr7, r31, r30
	bge-	cr7, L(ufix)		C r >= d needs one more correction
L(ust):
	std	r10, 0(r29)
	addi	r29, r29, -8
	sld	r7, r8, r27		C low bits of this limb for the next pass
	bdnz	L(utop)
C Last integer limb: same step with nl = r7, nothing further to load.
L(ulst):
	addi	r11, r31, 1
	nop
	mulld	r0, r31, r3
	mulhdu	r8, r31, r3
	addc	r0, r0, r7		C ql
	adde	r8, r8, r11		C qh
	mulld	r31, r8, r30
	subf	r31, r31, r7		C r = nl - qh * d
	subfc	r0, r0, r31		C carry = (r >= ql)
	subfe	r0, r0, r0		C r0 = (r >= ql) - 1
	not	r7, r0			C r7 = -(r >= ql)
	add	r8, r7, r8		C qh -= (r >= ql)
	andc	r0, r30, r0
	add	r31, r31, r0
	cmpld	cr7, r31, r30
	bge-	cr7, L(lfix)
L(lst):
	std	r8, 0(r29)
	addi	r29, r29, -8
C Fraction limbs, unnormalized divisor: fn passes with a zero dividend limb.
L(ufrc):
	addic.	r0, r25, -1
	blt-	cr0, L(udone)		C fn == 0
	mtctr	r25
	neg	r9, r30			C r9 = -d
	ALIGN(16)
L(uftop):
	addi	r11, r31, 1
	nop
	mulld	r7, r3, r31		C ql
	mulhdu	r10, r3, r31
	add	r10, r10, r11		C qh
	mulld	r31, r9, r10		C r = -qh * d
ifelse(0,1,`
	subfc	r0, r7, r31
	subfe	r0, r0, r0		C r0 = (r >= ql) - 1
	not	r7, r0
	add	r10, r7, r10		C qh -= (r >= ql)
	andc	r0, r30, r0
	add	r31, r31, r0
',`
	cmpld	cr7, r31, r7
	blt	cr7, L(ufsk)		C r < ql, estimate stands
	add	r31, r30, r31		C r += d
	addi	r10, r10, -1		C qh -= 1
L(ufsk):
')
	std	r10, 0(r29)
	addi	r29, r29, -8
	bdnz	L(uftop)
L(udone):
	srd	r3, r31, r27		C undo the shift to get the remainder
C Common exit for the early returns and the unnormalized path.
L(ret):
	addi	r1, r1, 176
	ld	r0, 16(r1)
	lwz	r12, 8(r1)
	mtlr	r0
	ld	r25, -56(r1)
	ld	r26, -48(r1)
	mtcrf	8, r12			C put back cr4 from the saved CR image
	ld	r27, -40(r1)
	ld	r28, -32(r1)
	ld	r29, -24(r1)
	ld	r30, -16(r1)
	ld	r31, -8(r1)
	blr

C Divisor with its top bit set.  With integer limbs present, the top quotient
C limb is 1 exactly when the top source limb is >= d, and d is then
C subtracted from it to form the initial remainder.
L(nrm):
	cmpdi	cr7, r6, 0
	beq-	cr7, L(ninv)		C un == 0
	sldi	r9, r6, 3
	addi	r29, r29, -8
	add	r9, r9, r5
	addi	r28, r6, -1
	ld	r31, -8(r9)		C r31 = up[un-1]
	subfc	r9, r7, r31		C carry = (top limb >= d)
	li	r9, 0
	adde	r9, r9, r9		C r9 = that carry, 0 or 1
	neg	r0, r9			C all ones if carry, else 0
	std	r9, -8(r11)		C top quotient limb
	and	r0, r0, r7
	subf	r31, r0, r31		C remainder = top limb - (d or 0)
L(ninv):
	mr	r3, r30
	CALL(	mpn_invert_limb)
	nop				C slot after the call (presumably for TOC restore; confirm)
	addic.	r6, r28, -1
	blt-	cr0, L(nfrc)		C no integer limbs left
	mtctr	r28
	sldi	r6, r6, 3		C r6 = byte offset of the next source limb
	ALIGN(16)
C Integer limbs, top bit of d set: source limbs are used unshifted.
L(ntop):
	addi	r11, r31, 1		C r + 1
	ldx	r8, r26, r6		C nl
	mulld	r0, r31, r3
	addi	r6, r6, -8
	mulhdu	r10, r31, r3
	addc	r7, r0, r8		C ql
	adde	r10, r10, r11		C qh
	mulld	r31, r10, r30
	subf	r31, r31, r8		C r = nl - qh * d
	subfc	r0, r7, r31		C carry = (r >= ql)
	subfe	r0, r0, r0		C r0 = (r >= ql) - 1
	not	r7, r0			C r7 = -(r >= ql)
	add	r10, r7, r10		C qh -= (r >= ql)
	andc	r0, r30, r0
	add	r31, r31, r0
	cmpld	cr7, r31, r30
	bge-	cr7, L(nfix)
L(nst):
	std	r10, 0(r29)
	addi	r29, r29, -8
	bdnz	L(ntop)

C Fraction limbs, top bit of d set.
L(nfrc):
	addic.	r9, r25, -1
	blt-	cr0, L(ndone)		C fn == 0
	mtctr	r25
	neg	r9, r30			C r9 = -d
	ALIGN(16)
L(nftop):
	addi	r11, r31, 1
	nop
	mulld	r7, r3, r31		C ql
	mulhdu	r10, r3, r31
	add	r10, r10, r11		C qh
	mulld	r31, r9, r10		C r = -qh * d
ifelse(0,1,`
	subfc	r0, r7, r31
	subfe	r0, r0, r0		C r0 = (r >= ql) - 1
	not	r7, r0
	add	r10, r7, r10		C qh -= (r >= ql)
	andc	r0, r30, r0
	add	r31, r31, r0
',`
	cmpld	cr7, r31, r7
	blt	cr7, L(nfsk)		C r < ql, estimate stands
	add	r31, r30, r31		C r += d
	addi	r10, r10, -1		C qh -= 1
L(nfsk):
')
	std	r10, 0(r29)
	addi	r29, r29, -8
	bdnz	L(nftop)
C Exit for this path: the remainder is returned unshifted.
L(ndone):
	addi	r1, r1, 176
	mr	r3, r31
	ld	r0, 16(r1)
	lwz	r12, 8(r1)
	mtlr	r0
	ld	r25, -56(r1)
	ld	r26, -48(r1)
	mtcrf	8, r12			C put back cr4 from the saved CR image
	ld	r27, -40(r1)
	ld	r28, -32(r1)
	ld	r29, -24(r1)
	ld	r30, -16(r1)
	ld	r31, -8(r1)
	blr

C Out of line corrections for the unlikely case r >= d after the main
C adjustment: subtract d once more and bump the quotient limb.
L(ufix):
	subf	r31, r30, r31
	addi	r10, r10, 1
	b	L(ust)
L(nfix):
	subf	r31, r30, r31
	addi	r10, r10, 1
	b	L(nst)
L(lfix):
	subf	r31, r30, r31
	addi	r8, r8, 1
	b	L(lst)
EPILOGUE()