dnl  PowerPC-64 mpn_mul_1 -- Multiply a limb vector with a limb and store
dnl  the result in a second limb vector.

dnl  Copyright 1999-2001, 2003-2006, 2010 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C		    cycles/limb
C POWER3/PPC630	     6-18
C POWER4/PPC970	     7.25?  not updated for last file revision
C POWER5	     7.25
C POWER6	    14
C POWER7	     2.9

C TODO
C  * Try to reduce the number of needed live registers (at least r5 and r10
C    could be combined)
C  * Optimize feed-in code, for speed and size.
C  * Clean up r12/r7 usage in feed-in code.

C INPUT PARAMETERS
define(`rp', `r3')
define(`up', `r4')
define(`n', `r5')
define(`vl', `r6')

C OVERVIEW
C  Two entry points share one body:
C    mpn_mul_1c   starts with the carry limb taken from r7
C    mpn_mul_1    starts with a carry limb of zero
C  For each of the n source limbs at up, the 128-bit product with r6 is
C  formed, its low half plus the running carry is stored at rp, and its high
C  half becomes the next carry.  The final carry limb is returned in r3.
C  The first source limb is loaded before n is examined and every path
C  stores at least one limb, so n is assumed to be at least 1.
C
C REGISTER USE
C  r26, r27	 source limbs currently being multiplied; their incoming
C		 values are saved on entry and reloaded at L(ret)
C  r12		 carry limb (high half of the latest product not yet added)
C  r0 r7 r9 r11	 low product halves
C  r5 r8 r10 r12 high product halves (r5 is reused once n is in ctr)
C  ctr		 (n + 3) / 4
C  CA		 carry bit linking the addc/adde/addze chain; the loads,
C		 stores, multiplies, addi and ctr branches in between do not
C		 write it
C  vl is r6; the instructions below name r6 directly.
C
C  r26 and r27 are stored at -48(r1) and -40(r1) without moving r1, and this
C  code makes no calls.  NOTE(review): this assumes the target ABI lets a
C  leaf routine use that area below the stack pointer -- confirm.

ASM_START()
PROLOGUE(mpn_mul_1c)
	std	r27, -40(r1)
	std	r26, -48(r1)
	mr	r12, r7		C cy_limb = r7
	b	L(ent)
EPILOGUE()
PROLOGUE(mpn_mul_1)
	std	r27, -40(r1)
	std	r26, -48(r1)
	li	r12, 0		C cy_limb = 0
L(ent):	ld	r26, 0(up)	C first source limb, used on every path

C Dispatch on n & 3:  0 -> b00,  1 -> b01,  2 -> b10,  3 falls into b11.
C addic (rather than addi) also writes CA.  The b10 path reaches its first
C adde with no addc before it, so it consumes this CA value; it is 0 as long
C as n + 3 does not carry out of 64 bits.
	rldicl.	r0, n, 0,62	C r0 = n & 3, set cr0
	cmpdi	cr6, r0, 2
	addic	n, n, 3		C compute count...
	srdi	n, n, 2		C ...for ctr
	mtctr	n		C copy count into ctr
	beq	cr0, L(b00)
	blt	cr6, L(b01)
	beq	cr6, L(b10)

C n & 3 == 3: handle one limb here, then continue with pairs.
L(b11):	mr	r7, r12		C move carry-in aside, r12 gets the new high half
	mulld	r0, r26, r6
	mulhdu	r12, r26, r6
	addi	up, up, 8
	addc	r0, r0, r7
	std	r0, 0(rp)
	addi	rp, rp, 8
	b	L(fic)

C n & 3 == 0: handle two limbs here, then continue with pairs.
L(b00):	ld	r27, 8(up)
	addi	up, up, 16
	mulld	r0, r26, r6
	mulhdu	r5, r26, r6
	mulld	r7, r27, r6
	mulhdu	r8, r27, r6
	addc	r0, r0, r12
	adde	r7, r7, r5
	addze	r12, r8		C fold CA into the carry limb
	std	r0, 0(rp)
	std	r7, 8(rp)
	addi	rp, rp, 16
	b	L(fic)

	nop			C alignment
C n & 3 == 1: bdnz decrements ctr; it reaches zero only when n == 1, in
C which case the single limb is finished here and r8 holds the high half
C that L(ret) returns.
L(b01):	bdnz	L(gt1)
	mulld	r0, r26, r6
	mulhdu	r8, r26, r6
	addc	r0, r0, r12
	std	r0, 0(rp)
	b	L(ret)
C n & 3 == 1 and n > 1: handle three limbs here, then continue with pairs.
L(gt1):	ld	r27, 8(up)
	nop
	mulld	r0, r26, r6
	mulhdu	r5, r26, r6
	ld	r26, 16(up)
	mulld	r7, r27, r6
	mulhdu	r8, r27, r6
	mulld	r9, r26, r6
	mulhdu	r10, r26, r6
	addc	r0, r0, r12
	adde	r7, r7, r5
	adde	r9, r9, r8
	addze	r12, r10	C fold CA into the carry limb
	std	r0, 0(rp)
	std	r7, 8(rp)
	std	r9, 16(rp)
	addi	up, up, 24
	addi	rp, rp, 24
	b	L(fic)

	nop
C Load the next pair of source limbs.  b10 enters one instruction later
C because r26 already holds its first limb from L(ent).
L(fic):	ld	r26, 0(up)
L(b10):	ld	r27, 8(up)
	addi	up, up, 16
	bdz	L(end)		C only the final pair left

C Main loop, four limbs per pass.  On entry r26/r27 hold the first two
C limbs of the group; the last two loads fetch the pair for the next pass
C or for L(end).
L(top):	mulld	r0, r26, r6
	mulhdu	r5, r26, r6
	mulld	r7, r27, r6
	mulhdu	r8, r27, r6
	ld	r26, 0(up)
	ld	r27, 8(up)
	adde	r0, r0, r12	C add carry limb from the previous group
	adde	r7, r7, r5
	mulld	r9, r26, r6
	mulhdu	r10, r26, r6
	mulld	r11, r27, r6
	mulhdu	r12, r27, r6	C new carry limb for the next group
	ld	r26, 16(up)
	ld	r27, 24(up)
	std	r0, 0(rp)
	adde	r9, r9, r8
	std	r7, 8(rp)
	adde	r11, r11, r10
	std	r9, 16(rp)
	addi	up, up, 32
	std	r11, 24(rp)

	addi	rp, rp, 32
	bdnz	L(top)

C Wind-down: the final two limbs, already loaded in r26/r27.
L(end):	mulld	r0, r26, r6
	mulhdu	r5, r26, r6
	mulld	r7, r27, r6
	mulhdu	r8, r27, r6
	adde	r0, r0, r12
	adde	r7, r7, r5
	std	r0, 0(rp)
	std	r7, 8(rp)
C Return value: last high half plus the outstanding carry bit.  r3 was rp
C until now; it is not needed as a pointer past this point.
L(ret):	addze	r3, r8
	ld	r27, -40(r1)
	ld	r26, -48(r1)
	blr
EPILOGUE()