dnl  PowerPC-64 mpn_divrem_1 -- Divide an mpn number by an unnormalized limb.

dnl  Copyright 2003, 2004, 2005, 2007, 2008, 2010, 2012 Free Software
dnl  Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C                      cycles/limb
C                 norm    unorm    frac
C POWER3/PPC630   16-34   16-34    ~11     outdated figures
C POWER4/PPC970   28      28       19
C POWER5          29      29      ~19
C POWER6          49      59      ~42
C POWER7          24.5    23      ~14

C INPUT PARAMETERS
C qp = r3
C fn = r4
C up = r5
C un = r6
C d  = r7

C We use a not very predictable branch in the frac code, therefore the cycle
C count wobbles somewhat.  With the alternative branch-free code, things run
C considerably slower on POWER4/PPC970 and POWER5.

C Add preinv entry point.


ASM_START()

EXTERN_FUNC(mpn_invert_limb)

PROLOGUE(mpn_divrem_1)

	mfcr	r12
	add.	r10, r6, r4
	std	r25, -56(r1)
	mr	r25, r4
	mflr	r0
	std	r26, -48(r1)
	mr	r26, r5
	std	r28, -32(r1)
	mr	r28, r6
	std	r29, -24(r1)
	mr	r29, r3
	li	r3, 0
	std	r30, -16(r1)
	mr	r30, r7
	std	r31, -8(r1)
	li	r31, 0
	std	r27, -40(r1)
	std	r0, 16(r1)
	stw	r12, 8(r1)
	stdu	r1, -176(r1)
	beq-	cr0, L(1)
	cmpdi	cr7, r7, 0
	sldi	r0, r10, 3
	add	r11, r0, r29
	addi	r29, r11, -8
	blt-	cr7, L(162)
	cmpdi	cr4, r6, 0
	beq+	cr4, L(71)
L(163):
	sldi	r9, r6, 3
	add	r9, r9, r5
	ld	r7, -8(r9)
	cmpld	cr7, r7, r30
	bge-	cr7, L(71)
	cmpdi	cr7, r10, 1
	li	r0, 0
	mr	r31, r7
	std	r0, -8(r11)
	addi	r29, r29, -8
	mr	r3, r7
	beq-	cr7, L(1)
	addi	r28, r6, -1
	cmpdi	cr4, r28, 0
L(71):
	cntlzd	r27, r30
	sld	r30, r30, r27
	sld	r31, r31, r27
	mr	r3, r30
	CALL(	mpn_invert_limb)
	nop
	beq-	cr4, L(110)
	sldi	r9, r28, 3
	addic.	r6, r28, -2
	add	r9, r9, r26
	subfic	r5, r27, 64
	ld	r8, -8(r9)
	srd	r0, r8, r5
	or	r31, r31, r0
	sld	r7, r8, r27
	blt-	cr0, L(154)
	addi	r28, r28, -1
	mtctr	r28
	sldi	r6, r6, 3
	ALIGN(16)
L(uloop):
	ldx	r8, r26, r6
	nop
	mulld	r0, r31, r3
	mulhdu	r10, r31, r3
	addi	r11, r31, 1
	srd	r9, r8, r5
	addi	r6, r6, -8
	or	r9, r7, r9
	addc	r0, r0, r9
	adde	r10, r10, r11
	mulld	r31, r10, r30
	subf	r31, r31, r9
	subfc	r0, r31, r0	C r <= ql
	subfe	r0, r0, r0	C r0 = -(r <= ql)
	and	r9, r30, r0
	add	r31, r31, r9
	add	r10, r0, r10	C qh -= (r >= ql)
	cmpld	cr7, r31, r30
	bge-	cr7, L(164)
L(123):
	std	r10, 0(r29)
	addi	r29, r29, -8
	sld	r7, r8, r27
	bdnz	L(uloop)
L(154):
	addi	r11, r31, 1
	nop
	mulld	r0, r31, r3
	mulhdu	r8, r31, r3
	addc	r0, r0, r7
	adde	r8, r8, r11
	mulld	r31, r8, r30
	subf	r31, r31, r7
	subfc	r0, r0, r31	C r >= ql
	subfe	r0, r0, r0	C r0 = -(r >= ql)
	not	r7, r0
	add	r8, r7, r8	C qh -= (r >= ql)
	andc	r0, r30, r0
	add	r31, r31, r0
	cmpld	cr7, r31, r30
	bge-	cr7, L(165)
L(134):
	std	r8, 0(r29)
	addi	r29, r29, -8
L(110):
	addic.	r0, r25, -1
	blt-	cr0, L(156)
	mtctr	r25
	neg	r9, r30
	ALIGN(16)
L(ufloop):
	addi	r11, r31, 1
	nop
	mulld	r0, r3, r31
	mulhdu	r10, r3, r31
	add	r10, r10, r11
	mulld	r31, r9, r10
ifelse(0,1,`
	subfc	r0, r0, r31
	subfe	r0, r0, r0	C r0 = -(r >= ql)
	not	r7, r0
	add	r10, r7, r10	C qh -= (r >= ql)
	andc	r0, r30, r0
	add	r31, r31, r0
',`
	cmpld	cr7, r31, r0
	blt	cr7, L(29)
	add	r31, r30, r31
	addi	r10, r10, -1
L(29):
')
	std	r10, 0(r29)
	addi	r29, r29, -8
	bdnz	L(ufloop)
L(156):
	srd	r3, r31, r27
L(1):
	addi	r1, r1, 176
	ld	r0, 16(r1)
	lwz	r12, 8(r1)
	mtlr	r0
	ld	r25, -56(r1)
	ld	r26, -48(r1)
	mtcrf	8, r12
	ld	r27, -40(r1)
	ld	r28, -32(r1)
	ld	r29, -24(r1)
	ld	r30, -16(r1)
	ld	r31, -8(r1)
	blr
L(162):
	cmpdi	cr7, r6, 0
	beq-	cr7, L(8)
	sldi	r9, r6, 3
	addi	r29, r29, -8
	add	r9, r9, r5
	addi	r28, r6, -1
	ld	r31, -8(r9)
	subfc	r9, r7, r31
	li	r9, 0
	adde	r9, r9, r9
	neg	r0, r9
	std	r9, -8(r11)
	and	r0, r0, r7
	subf	r31, r0, r31
L(8):
	mr	r3, r30
	CALL(	mpn_invert_limb)
	li	r27, 0
	addic.	r6, r28, -1
	blt-	cr0, L(110)
	mtctr	r28
	sldi	r6, r6, 3
	ALIGN(16)
L(nloop):
	addi	r11, r31, 1
	ldx	r8, r26, r6
	mulld	r0, r31, r3
	mulhdu	r10, r31, r3
	addi	r6, r6, -8
	addc	r0, r0, r8
	adde	r10, r10, r11
	mulld	r31, r10, r30
	subf	r31, r31, r8	C r = nl - qh * d
	subfc	r0, r31, r0	C r <= ql
	subfe	r0, r0, r0	C r0 = -(r <= ql)
	and	r9, r30, r0
	add	r31, r31, r9
	add	r10, r0, r10	C qh -= (r >= ql)
	cmpld	cr7, r31, r30
	bge-	cr7, L(167)
L(51):
	std	r10, 0(r29)
	addi	r29, r29, -8
	bdnz	L(nloop)
	b	L(110)

L(164):
	subf	r31, r30, r31
	addi	r10, r10, 1
	b	L(123)
L(167):
	subf	r31, r30, r31
	addi	r10, r10, 1
	b	L(51)
L(165):
	subf	r31, r30, r31
	addi	r8, r8, 1
	b	L(134)
EPILOGUE()
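
C For reference, below is a minimal C sketch (not part of this file or of the
C GMP build) of the per-limb step that L(uloop) and L(nloop) perform: 2/1
C division of (remainder, next limb) by the normalized divisor d, using the
C reciprocal dinv obtained from mpn_invert_limb, with the rare final
C correction that the bge- branches to L(164)/L(167) handle.  L(ufloop) is
C the same step with the low limb fixed at 0.  The helper name div_step and
C the use of unsigned __int128 are illustration-only assumptions.
C
C   #include <stdint.h>
C
C   typedef unsigned __int128 u128;
C
C   /* Divide (*r * 2^64 + nl) by the normalized divisor d, where *r < d and
C      dinv is the reciprocal of d as returned by mpn_invert_limb.  Returns
C      the quotient limb and stores the new remainder through *r.  */
C   static uint64_t
C   div_step (uint64_t *r, uint64_t nl, uint64_t d, uint64_t dinv)
C   {
C     uint64_t nh = *r;                            /* running remainder (r31) */
C     u128 p = (u128) nh * dinv;                   /* mulld/mulhdu            */
C     u128 s = p + ((u128) (nh + 1) << 64) + nl;   /* addi, addc/adde         */
C     uint64_t qh = (uint64_t) (s >> 64);
C     uint64_t ql = (uint64_t) s;
C     uint64_t rem = nl - qh * d;                  /* candidate remainder     */
C     uint64_t mask = -(uint64_t) (rem > ql);      /* subfc/subfe mask        */
C     qh += mask;                                  /* undo overestimate by 1  */
C     rem += mask & d;
C     if (rem >= d)                                /* rare final correction   */
C       {
C         rem -= d;
C         qh += 1;
C       }
C     *r = rem;
C     return qh;
C   }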