dnl  PPC-32 mpn_divrem_2 -- Divide an mpn number by a normalized 2-limb number.

dnl  Copyright 2007, 2008, 2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C		cycles/limb
C		norm	frac
C 7410		~36.5	~36.5
C 744x, 745x	 29	 29

C INPUT PARAMETERS
C qp  = r3
C fn  = r4
C up  = r5
C un  = r6
C d   = r7

C TODO
C  * Decrease register usage.
C  * Make sure mul operands are optimal for early-out.
C  * Check that things work well for a shared library build.
C  * Write an invert_limb, perhaps inline, perhaps as a private call.  Or at
C    least vastly improve the current __udiv_qrnnd_c based code.

C OVERVIEW
C  Divides the limbs at up (un integer limbs followed by fn implicit zero
C  fraction limbs) by the 2-limb divisor d1:d0 loaded from d.
C
C  1. The two most significant limbs of up are compared with d1:d0.  If they
C     are >= the divisor it is subtracted once and r3 (the return value) is
C     set to 1, otherwise r3 is set to 0.
C  2. If un + fn - 2 <= 0 the function goes straight to the exit code.
C  3. An inverse di is computed from d1 with a 16-bit-halves division and is
C     then adjusted to account for d0.
C  4. The main loop runs un + fn - 2 times.  Each pass forms a quotient limb
C     from the current 2-limb remainder and di, brings in the next lower limb
C     of up (or zero once only fraction limbs remain), corrects the estimate,
C     and stores the quotient limb.  Limbs are stored downwards from
C     qp[un+fn-3] to qp[0].
C  5. On exit the 2-limb remainder is stored at 0(r29) and 4(r29), where r29
C     is the up read pointer; after a complete run that is up[0] and up[1].
C
C  r28-r31 are saved in a 32-byte stack frame and restored before returning.

ASM_START()
PROLOGUE(mpn_divrem_2)
	stwu	r1, -32(r1)		C allocate a 32-byte stack frame
	slwi	r0, r6, 2		C r0 = 4 * un (byte count)
	add	r5, r5, r0		C r5 = up + 4 * un
	stmw	r28, 8(r1)		C save r28..r31 in the frame
	addi	r29, r5, -8		C up = up_param + un - 2
	lwz	r10, 4(r7)		C d1 = d[1]
	lwz	r12, 4(r29)		C high dividend limb, up[un-1]
	addi	r8, r3, -12		C r8 = qp - 12 bytes, un+fn is added below
	lwz	r7, 0(r7)		C d0 = d[0], r7 no longer holds the pointer
	cmplw	cr7, r12, r10		C compare high dividend limb with d1
	lwz	r28, 0(r29)		C next dividend limb, up[un-2]
	blt-	cr7, L(2)		C high limb < d1: top limbs < divisor
	bgt+	cr7, L(4)		C high limb > d1: top limbs > divisor
	cmplw	cr7, r28, r7		C high limbs equal, compare low limb with d0
	blt-	cr7, L(2)
L(4):	subfc	r28, r7, r28		C r12:r28 -= d1:d0 (low limb, sets CA)
	subfe	r12, r10, r12		C   (high limb, uses CA)
	li	r3, 1			C most significant quotient limb = 1
	b	L(6)
L(2):	li	r3, 0			C most significant quotient limb = 0

L(6):	add	r0, r4, r6		C r0 = fn + un
	addic.	r30, r0, -2		C r30 = un + fn - 2, cr0 set from the result
	ble-	cr0, L(ret)		C nothing more to produce, write back and exit

	slwi	r9, r0, 2		C r9 = 4 * (un + fn)
	add	r8, r8, r9		C qp += un + fn; with the -12 bias r8 = &qp[un+fn-3]
	mtctr	r30			C loop count = un + fn - 2

C Compute di from d1
C This is a two-step division of the 64-bit value (~d1):0xffffffff by d1,
C working on 16-bit halves of d1 (cf. the __udiv_qrnnd_c remark in the TODO
C list).  r31 receives the high quotient half q1, r6 the low half q0.
	srwi	r11, r10, 16		C r11 = high 16 bits of d1
	nor	r0, r10, r10		C r0 = ~d1, high word of the dividend
	divwu	r31, r0, r11		C q1 estimate
	rlwinm	r5, r10, 0, 16, 31	C r5 = low 16 bits of d1
	mullw	r9, r11, r31		C q1 * high half of d1
	mullw	r6, r5, r31		C q1 * low half of d1
	subf	r0, r9, r0		C partial remainder
	slwi	r0, r0, 16
	ori	r0, r0, 65535		C shift in 16 one bits from the dividend
	cmplw	cr7, r0, r6
	bge-	cr7, L(9)		C estimate is fine
	add	r0, r0, r10		C add d1 back
	cmplw	cr7, r0, r10		C cr7.lt (bit 28) set if the add wrapped
	cmplw	cr6, r6, r0		C cr6.gt (bit 25) set if still too small
	addi	r31, r31, -1		C q1--
	crorc	28, 28, 25		C bit 28 = wrapped OR NOT (r6 > r0)
	bc+	12, 28, L(9)		C branch if bit 28 set: one correction was enough
	addi	r31, r31, -1		C q1--
	add	r0, r0, r10
L(9):	subf	r0, r6, r0		C remainder after the first step
	divwu	r6, r0, r11		C q0 estimate
	mullw	r9, r11, r6		C q0 * high half of d1
	mullw	r11, r5, r6		C q0 * low half of d1 (r11 is reused)
	subf	r0, r9, r0
	slwi	r0, r0, 16
	ori	r0, r0, 65535		C shift in the last 16 one bits
	cmplw	cr7, r0, r11
	bge-	cr7, L(13)
	add	r0, r0, r10
	cmplw	cr7, r0, r10
	cmplw	cr6, r11, r0
	addi	r6, r6, -1		C q0--
	crorc	28, 28, 25		C same test as for q1 above
	bc+	12, 28, L(13)
C	add	r0, r0, r10		C final remainder
	addi	r6, r6, -1		C q0--
L(13):	rlwimi	r6, r31, 16, 0, 15	C assemble final quotient

C Adjust di by including d0
C r0 counts the carries produced below, minus one.  While r0 >= 0, d1 is
C subtracted from the running sum in r9 and di is decremented.
	mullw	r9, r10, r6		C t0 = LO(di * d1)
	addc	r11, r9, r7		C t0 += d0, CA = carry out
	subfe	r0, r1, r1		C r0 = CA - 1; r1 is only read here
	mulhwu	r9, r6, r7		C s1 = HI(di * d0)
	addc	r9, r11, r9		C t0 += s1, CA = carry out
	addze.	r0, r0			C r0 += CA, cr0 set from the result
	blt	cr0, L(17)		C r0 = -1: no carry at all, di is final
L(18):	subfc	r9, r10, r9		C r9 -= d1, CA = 1 unless it borrowed
	addi	r6, r6, -1		C di--
	addme.	r0, r0			C r0 += CA - 1, cr0 set from the result
	bge+	cr0, L(18)
L(17):

C Register usage in the main loop
C   r3   most significant quotient limb (return value), not touched here
C   r4   fn
C   r5   quotient limb being formed
C   r6   di
C   r7   d0
C   r8   qp store pointer, moves down one limb per iteration
C   r10  d1
C   r12  high remainder limb (n2 on entry to an iteration)
C   r28  low remainder limb (n1 on entry to an iteration)
C   r29  up read pointer, moves down while integer limbs remain
C   r30  remaining limb count, compared against fn
C   r0, r9, r11, r31  scratch (r9 = incoming limb n0, r31 = q0)
L(loop):
	mullw	r0, r12, r6		C q0 = LO(n2 * di)
	cmpw	cr7, r30, r4		C count <= fn: only fraction limbs remain
	addc	r31, r0, r28		C q0 += n1
	mulhwu	r9, r12, r6		C q  = HI(n2 * di)
	adde	r12, r9, r12		C q += n2
	addi	r30, r30, -1		C one limb fewer to go
	mullw	r0, r10, r12		C d1 * q
	li	r9, 0			C n0 = 0 unless a limb is loaded below
	subf	r0, r0, r28		C n1 -= d1 * q
	addi	r5, r12, 1		C candidate quotient limb = q + 1
	ble-	cr7, L(23)		C fraction part: keep n0 = 0
	lwzu	r9, -4(r29)		C n0 = next lower limb of up, r29 updated
L(23):	mullw	r11, r12, r7		C t0 = LO(d0 * q)
	subfc	r28, r7, r9		C n0 -= d0
	subfe	r0, r10, r0		C n1 -= d1
	mulhwu	r12, r12, r7		C t1 = HI(d0 * q)
	subfc	r28, r11, r28		C n0 -= t0
	subfe	r12, r12, r0		C n1 -= t1
	cmplw	cr7, r12, r31		C compare new n1 with q0
	blt+	cr7, L(24)		C n1 < q0: keep the candidate
	addc	r28, r28, r7		C otherwise add d1:d0 back ...
	adde	r12, r12, r10
	addi	r5, r5, -1		C ... and decrement the quotient limb
L(24):	cmplw	cr7, r12, r10		C cr7 stays live into L(fix)
	bge-	cr7, L(fix)		C n1 >= d1: may need one more correction
L(bck):	stw	r5, 0(r8)		C store quotient limb
	addi	r8, r8, -4		C step qp down one limb
	bdnz	L(loop)

L(ret):	stw	r28, 0(r29)		C write low remainder limb back to up
	stw	r12, 4(r29)		C write high remainder limb back to up
	lmw	r28, 8(r1)		C restore r28..r31
	addi	r1, r1, 32		C release the stack frame
	blr				C return with the result in r3

C Rare correction path: reached with n1 >= d1 and cr7 still holding the
C comparison of n1 with d1.
L(fix):	cmplw	cr6, r28, r7		C compare n0 with d0
	bgt+	cr7, L(28)		C n1 > d1: remainder >= divisor
	blt-	cr6, L(bck)		C n1 = d1 and n0 < d0: nothing to fix
L(28):	subfc	r28, r7, r28		C r12:r28 -= d1:d0
	subfe	r12, r10, r12
	addi	r5, r5, 1		C quotient limb++
	b	L(bck)
EPILOGUE()