dnl  PowerPC-64 mpn_divrem_1 -- Divide an mpn number by an unnormalized limb.

dnl  Copyright 2003-2005, 2007, 2008, 2010, 2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C                    cycles/limb
C                  norm    unorm    frac
C POWER3/PPC630    16-34   16-34    ~11      outdated figures
C POWER4/PPC970     28      28       19
C POWER5            29      29      ~19
C POWER6            49      59      ~42
C POWER7            24.5    23      ~14

C INPUT PARAMETERS
C qp = r3
C fn = r4
C up = r5
C un = r6
C d  = r7

C We use a not very predictable branch in the frac code, therefore the cycle
C count wobbles somewhat.  With the alternative branch-free code, things run
C considerably slower on POWER4/PPC970 and POWER5.

C Add preinv entry point.


ASM_START()

EXTERN_FUNC(mpn_invert_limb)

PROLOGUE(mpn_divrem_1,toc)

	mfcr	r12
	add.	r10, r6, r4
	std	r25, -56(r1)
	mr	r25, r4
	mflr	r0
	std	r26, -48(r1)
	mr	r26, r5
	std	r28, -32(r1)
	mr	r28, r6
	std	r29, -24(r1)
	mr	r29, r3
	li	r3, 0
	std	r30, -16(r1)
	mr	r30, r7
	std	r31, -8(r1)
	li	r31, 0
	std	r27, -40(r1)
	std	r0, 16(r1)
	stw	r12, 8(r1)
	stdu	r1, -176(r1)
	beq-	cr0, L(1)
	cmpdi	cr7, r7, 0
	sldi	r0, r10, 3
	add	r11, r0, r29
	addi	r29, r11, -8
	blt-	cr7, L(162)
	cmpdi	cr4, r6, 0
	beq+	cr4, L(71)
L(163):
	sldi	r9, r6, 3
	add	r9, r9, r5
	ld	r7, -8(r9)
	cmpld	cr7, r7, r30
	bge-	cr7, L(71)
	cmpdi	cr7, r10, 1
	li	r0, 0
	mr	r31, r7
	std	r0, -8(r11)
	addi	r29, r29, -8
	mr	r3, r7
	beq-	cr7, L(1)
	addi	r28, r6, -1
	cmpdi	cr4, r28, 0
L(71):
	cntlzd	r27, r30
	sld	r30, r30, r27
	sld	r31, r31, r27
	mr	r3, r30
	CALL(	mpn_invert_limb)
	beq-	cr4, L(110)
	sldi	r9, r28, 3
	addic.	r6, r28, -2
	add	r9, r9, r26
	subfic	r5, r27, 64
	ld	r8, -8(r9)
	srd	r0, r8, r5
	or	r31, r31, r0
	sld	r7, r8, r27
	blt-	cr0, L(154)
	addi	r28, r28, -1
	mtctr	r28
	sldi	r6, r6, 3

C Main loop for an unnormalized divisor: dividend limbs are shifted left by
C r27 bits and divided using the inverse returned by mpn_invert_limb (r3).
	ALIGN(16)
L(uloop):
	ldx	r8, r26, r6
	nop
	mulld	r0, r31, r3
	mulhdu	r10, r31, r3
	addi	r11, r31, 1
	srd	r9, r8, r5
	addi	r6, r6, -8
	or	r9, r7, r9
	addc	r0, r0, r9
	adde	r10, r10, r11
	mulld	r31, r10, r30
	subf	r31, r31, r9
	subfc	r0, r31, r0	C r <= ql
	subfe	r0, r0, r0	C r0 = -(r <= ql)
	and	r9, r30, r0
	add	r31, r31, r9
	add	r10, r0, r10	C qh -= (r >= ql)
	cmpld	cr7, r31, r30
	bge-	cr7, L(164)
L(123):
	std	r10, 0(r29)
	addi	r29, r29, -8
	sld	r7, r8, r27
	bdnz	L(uloop)

C Last step of the integer part: divide the final shifted limb held in r7.
L(154):
	addi	r11, r31, 1
	nop
	mulld	r0, r31, r3
	mulhdu	r8, r31, r3
	addc	r0, r0, r7
	adde	r8, r8, r11
	mulld	r31, r8, r30
	subf	r31, r31, r7
	subfc	r0, r0, r31	C r >= ql
	subfe	r0, r0, r0	C r0 = -(r >= ql)
	not	r7, r0
	add	r8, r7, r8	C qh -= (r >= ql)
	andc	r0, r30, r0
	add	r31, r31, r0
	cmpld	cr7, r31, r30
	bge-	cr7, L(165)
L(134):
	std	r8, 0(r29)
	addi	r29, r29, -8

C Fraction part: develop fn further quotient limbs from zero dividend limbs.
L(110):
	addic.	r0, r25, -1
	blt-	cr0, L(156)
	mtctr	r25
	neg	r9, r30
	ALIGN(16)
L(ufloop):
	addi	r11, r31, 1
	nop
	mulld	r0, r3, r31
	mulhdu	r10, r3, r31
	add	r10, r10, r11
	mulld	r31, r9, r10
ifelse(0,1,`
	subfc	r0, r0, r31
	subfe	r0, r0, r0	C r0 = -(r >= ql)
	not	r7, r0
	add	r10, r7, r10	C qh -= (r >= ql)
	andc	r0, r30, r0
	add	r31, r31, r0
',`
	cmpld	cr7, r31, r0
	blt	cr7, L(29)
	add	r31, r30, r31
	addi	r10, r10, -1
L(29):
')
	std	r10, 0(r29)
	addi	r29, r29, -8
	bdnz	L(ufloop)
L(156):
	srd	r3, r31, r27

C Restore registers and return with the remainder in r3.
L(1):
	addi	r1, r1, 176
	ld	r0, 16(r1)
	lwz	r12, 8(r1)
	mtlr	r0
	ld	r25, -56(r1)
	ld	r26, -48(r1)
	mtcrf	8, r12
	ld	r27, -40(r1)
	ld	r28, -32(r1)
	ld	r29, -24(r1)
	ld	r30, -16(r1)
	ld	r31, -8(r1)
	blr

C Normalized divisor (most significant bit of d already set); no shifting.
L(162):
	cmpdi	cr7, r6, 0
	beq-	cr7, L(8)
	sldi	r9, r6, 3
	addi	r29, r29, -8
	add	r9, r9, r5
	addi	r28, r6, -1
	ld	r31, -8(r9)
	subfc	r9, r7, r31
	li	r9, 0
	adde	r9, r9, r9
	neg	r0, r9
	std	r9, -8(r11)
	and	r0, r0, r7
	subf	r31, r0, r31
L(8):
	mr	r3, r30
	CALL(	mpn_invert_limb)
	li	r27, 0
	addic.	r6, r28, -1
	blt-	cr0, L(110)
	mtctr	r28
	sldi	r6, r6, 3

C Main loop for a normalized divisor.
	ALIGN(16)
L(nloop):
	addi	r11, r31, 1
	ldx	r8, r26, r6
	mulld	r0, r31, r3
	mulhdu	r10, r31, r3
	addi	r6, r6, -8
	addc	r0, r0, r8
	adde	r10, r10, r11
	mulld	r31, r10, r30
	subf	r31, r31, r8	C r = nl - qh * d
	subfc	r0, r31, r0	C r <= ql
	subfe	r0, r0, r0	C r0 = -(r <= ql)
	and	r9, r30, r0
	add	r31, r31, r9
	add	r10, r0, r10	C qh -= (r >= ql)
	cmpld	cr7, r31, r30
	bge-	cr7, L(167)
L(51):
	std	r10, 0(r29)
	addi	r29, r29, -8
	bdnz	L(nloop)
	b	L(110)

C Unlikely adjustment paths: the remainder is still >= d, so subtract d once
C more and increment the just-computed quotient limb.
L(164):
	subf	r31, r30, r31
	addi	r10, r10, 1
	b	L(123)
L(167):
	subf	r31, r30, r31
	addi	r10, r10, 1
	b	L(51)
L(165):
	subf	r31, r30, r31
	addi	r8, r8, 1
	b	L(134)
EPILOGUE()