dnl  PowerPC-64 mpn_mul_1 -- Multiply a limb vector with a limb and store
dnl  the result in a second limb vector.

dnl  Copyright 1999, 2000, 2001, 2003, 2004, 2005, 2006 Free Software
dnl  Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C		cycles/limb
C POWER3/PPC630:     6-18
C POWER4/PPC970:     7.25
C POWER5:            7.75

C TODO
C  * Try to reduce the number of needed live registers (at least r5 and r10
C    could be combined)
C  * Optimize feed-in code, for speed and size.
C  * Clean up r12/r7 usage in feed-in code.

C INPUT PARAMETERS
define(`rp', `r3')
define(`up', `r4')
define(`n', `r5')
define(`vl', `r6')

C OVERVIEW
C  Two entry points share one body, starting at the label ent:
C    mpn_mul_1   starts with a zero carry limb in r12.
C    mpn_mul_1c  starts with r12 copied from r7 (presumably the carry-in limb
C                argument; confirm against the callers).
C  For each source limb the low product half (mulld) plus the running carry
C  is stored to rp, and the high half (mulhdu) becomes the next carry.  The
C  value left in r3 at the blr is the last high half plus the CA bit.
C
C REGISTER USE
C  r6 (vl)         multiplier limb, never modified
C  r12             carry limb fed into the next low half
C  r26, r27        source limbs loaded ahead of use; saved below r1 on entry
C                  and reloaded before the blr
C  r0, r7, r9, r11 low product halves
C  r5, r8, r10     high product halves (r5 is n, reused once ctr is loaded)
C  ctr             (n + 3) >> 2
C
C STRUCTURE
C  The feed-in code dispatches on n & 3 and handles 2, 1 or 3, 0, 1 limbs
C  (for n & 3 = 0, 1, 2, 3) so that what remains is 4 limbs per pass of the
C  loop plus a final pair done at the label end.
C
C PRECONDITIONS (as visible from the code)
C  * n must be at least 1: up[0] is loaded unconditionally and a zero count
C    is not special-cased.
C  * r26/r27 are stored at negative offsets from r1 without moving r1, so the
C    area just below the stack pointer must be usable by this leaf routine
C    (ABI assumption, confirm for new targets).

ASM_START()
PROLOGUE(mpn_mul_1c)
	std	r27, -40(r1)
	std	r26, -48(r1)
	mr	r12, r7		C cy_limb = caller supplied value in r7
	b	L(ent)
EPILOGUE()
PROLOGUE(mpn_mul_1)
	std	r27, -40(r1)
	std	r26, -48(r1)
	li	r12, 0		C cy_limb = 0
L(ent):	ld	r26, 0(up)

	rldicl.	r0, n, 0,62	C r0 = n & 3, set cr0
	cmpdi	cr6, r0, 2
C addic also writes CA.  The b10 path reaches its first adde with no other
C carry-setting instruction in between, so it relies on n + 3 not carrying
C out, i.e. CA = 0 here.
	addic	n, n, 3		C compute count...
	srdi	n, n, 2		C ...for ctr
	mtctr	n		C copy count into ctr
	beq	cr0, L(b00)
	blt	cr6, L(b01)
	beq	cr6, L(b10)

C n & 3 = 3: do one limb here, the rest goes through fic.
L(b11):	mr	r7, r12
	mulld	r0, r26, r6
	mulhdu	r12, r26, r6
	addi	up, up, 8
	addc	r0, r0, r7
	std	r0, 0(rp)
	addi	rp, rp, 8
	b	L(fic)

C n & 3 = 0: do two limbs here, the rest goes through fic.
L(b00):	ld	r27, 8(up)
	addi	up, up, 16
	mulld	r0, r26, r6
	mulhdu	r5, r26, r6
	mulld	r7, r27, r6
	mulhdu	r8, r27, r6
	addc	r0, r0, r12
	adde	r7, r7, r5
	addze	r12, r8
	std	r0, 0(rp)
	std	r7, 8(rp)
	addi	rp, rp, 16
	b	L(fic)

	nop			C alignment
C n & 3 = 1: bdnz decrements ctr; for n = 1 it reaches zero and the single
C limb is finished right here, otherwise three limbs are done at gt1.
L(b01):	bdnz	L(gt1)
	mulld	r0, r26, r6
	mulhdu	r8, r26, r6
	addc	r0, r0, r12
	std	r0, 0(rp)
	b	L(ret)
L(gt1):	ld	r27, 8(up)
	nop
	mulld	r0, r26, r6
	mulhdu	r5, r26, r6
	ld	r26, 16(up)
	mulld	r7, r27, r6
	mulhdu	r8, r27, r6
	mulld	r9, r26, r6
	mulhdu	r10, r26, r6
	addc	r0, r0, r12
	adde	r7, r7, r5
	adde	r9, r9, r8
	addze	r12, r10
	std	r0, 0(rp)
	std	r7, 8(rp)
	std	r9, 16(rp)
	addi	up, up, 24
	addi	rp, rp, 24
	b	L(fic)

	nop
C Common loop setup: preload the next two limbs.  b10 (n & 3 = 2) joins
C with r26 already holding up[0] from the load at ent.
L(fic):	ld	r26, 0(up)
L(b10):	ld	r27, 8(up)
	addi	up, up, 16
	bdz	L(end)

C Main loop, 4 limbs per pass.  On entry r26/r27 hold the next two limbs and
C r12 plus CA hold the carry from the previous limb.  None of the multiplies,
C loads, stores, addi or bdnz touch CA, so the adde chain carries across
C passes.
L(top):	mulld	r0, r26, r6
	mulhdu	r5, r26, r6
	ld	r26, 0(up)
	nop

	mulld	r7, r27, r6
	mulhdu	r8, r27, r6
	ld	r27, 8(up)
	nop

	adde	r0, r0, r12
	adde	r7, r7, r5

	mulld	r9, r26, r6
	mulhdu	r10, r26, r6
	ld	r26, 16(up)
	nop

	mulld	r11, r27, r6
	mulhdu	r12, r27, r6
	ld	r27, 24(up)

	std	r0, 0(rp)
	adde	r9, r9, r8
	std	r7, 8(rp)
	adde	r11, r11, r10
	std	r9, 16(rp)
	addi	up, up, 32
	std	r11, 24(rp)

	addi	rp, rp, 32
	bdnz	L(top)

C Wind-down: the last two limbs, already loaded into r26/r27.
L(end):	mulld	r0, r26, r6
	mulhdu	r5, r26, r6

	mulld	r7, r27, r6
	mulhdu	r8, r27, r6

	adde	r0, r0, r12
	adde	r7, r7, r5

	std	r0, 0(rp)
	std	r7, 8(rp)
L(ret):	addze	r3, r8		C r3 = last high half + CA
	ld	r27, -40(r1)
	ld	r26, -48(r1)
	blr
EPILOGUE()