dnl  PowerPC-64 mpn_mod_1_1p

dnl  Copyright 2010, 2011 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C                   cycles/limb
C POWER3/PPC630          ?
C POWER4/PPC970         17
C POWER5                16
C POWER6                30
C POWER7                10.2

C TODO
C  * Optimise, in particular the cps function.  This was compiler-generated and
C    then hand optimised.
C INPUT PARAMETERS
C These names describe the arguments of mpn_mod_1_1p.  The code below uses
C the raw register numbers.
define(`ap', `r3')
define(`n', `r4')
define(`d', `r5')
define(`cps', `r6')

ASM_START()

EXTERN_FUNC(mpn_invert_limb)

C mpn_mod_1_1p
C
C In:   r3 = pointer to the limb array, r4 = limb count,
C       r5 = divisor, r6 = pointer to the four doublewords written by
C       mpn_mod_1_1p_cps below:
C         0(r6)  inverse returned by mpn_invert_limb
C         8(r6)  shift count cnt
C        16(r6)  B1modb
C        24(r6)  B2modb
C Out:  r3 = remainder.
C Uses: r0, r7-r12, ctr, cr7 and the carry bit.  No stack frame, no calls.
C
C The two highest limbs are loaded unconditionally and the count register is
C set to the limb count minus one before the first bdz, so the limb count
C must be at least 2.
C
C NOTE(review): the inverse stored at 0(r6) is that of the divisor shifted
C left by cnt, and the result is shifted right by cnt on exit, so r5 is
C presumably expected to be that shifted divisor already.  Confirm against
C the callers.

PROLOGUE(mpn_mod_1_1p)
	sldi	r10, r4, 3		C byte size of the limb array
	addi	r4, r4, -1		C loop count, one less than limb count
	add	r3, r3, r10		C r3 points just past the highest limb
	ld	r0, 16(r6)		C B1modb
	ld	r12, 24(r6)		C B2modb
	ld	r9, -8(r3)		C highest limb
	ld	r10, -16(r3)		C second highest limb
	mtctr	r4
	mulhdu	r8, r9, r0
	mulld	r7, r9, r0
	addc	r11, r7, r10		C r9:r11 = highest * B1modb + second
	addze	r9, r8
	bdz	L(end)			C only two limbs, nothing left to fold

C Each iteration replaces the two-limb accumulator rh:rl held in r9:r11 by
C rh * B2modb + rl * B1modb + next lower limb.
	ALIGN(16)
L(top):	ld	r4, -24(r3)		C next lower limb
	addi	r3, r3, -8
	nop
	mulld	r10, r11, r0		C low  half of rl * B1modb
	mulld	r8, r9, r12		C low  half of rh * B2modb
	mulhdu	r11, r11, r0		C high half of rl * B1modb
	mulhdu	r9, r9, r12		C high half of rh * B2modb
	addc	r7, r10, r4
	addze	r10, r11
	addc	r11, r8, r7
	adde	r9, r9, r10
	bdnz	L(top)

C The cps function stores cnt as a full doubleword with std.  Load the whole
C doubleword so that the read does not depend on byte order.  A 32-bit load
C from offset 12 only sees the low half on big-endian targets and would read
C the always-zero high half on little-endian ones.
L(end):	ld	r0, 8(r6)		C cnt
	ld	r3, 0(r6)		C inverse
	cmpdi	cr7, r0, 0
	beq-	cr7, L(4)
	subfic	r10, r0, 64
	sld	r9, r9, r0
	srd	r10, r11, r10
	or	r9, r10, r9		C rh = rh << cnt | rl >> (64 - cnt)
L(4):	subfc	r10, r5, r9		C carry set when rh >= divisor
	subfe	r10, r10, r10		C 0 when rh >= divisor, else all ones
	nand	r10, r10, r10		C invert: all ones when rh >= divisor
	sld	r11, r11, r0		C rl <<= cnt
	and	r10, r10, r5
	subf	r9, r10, r9		C rh -= divisor when rh >= divisor
	mulhdu	r10, r9, r3		C high half of rh * inverse
	mulld	r3, r9, r3		C low  half of rh * inverse
	addi	r9, r9, 1
	addc	r8, r3, r11		C ql = low half + rl
	adde	r3, r10, r9		C qh = high half + rh + 1 + carry
	mulld	r3, r3, r5
	subf	r3, r3, r11		C r = rl - qh * divisor
	cmpld	cr7, r8, r3
	bge	cr7, L(5)		C FIXME: Make branch-less
	add	r3, r3, r5		C r > ql, so add the divisor back
L(5):	cmpld	cr7, r3, r5
	bge-	cr7, L(10)
	srd	r3, r3, r0		C undo the shift by cnt
	blr

L(10):	subf	r3, r5, r3		C r >= divisor, subtract it once
	srd	r3, r3, r0
	blr
EPILOGUE()

C mpn_mod_1_1p_cps
C
C In:   r3 = pointer to four doublewords of output, r4 = divisor.
C Out:  0(r3) = value returned by mpn_invert_limb for the divisor shifted
C               left by cnt
C       8(r3) = cnt, the count of leading zero bits of the divisor
C      16(r3) = B1modb shifted right by cnt
C      24(r3) = B2modb shifted right by cnt
C
C Not a leaf: calls mpn_invert_limb.  r29, r30 and r31 are saved below the
C incoming stack pointer and the link register at 16(r1) before a 144 byte
C frame is pushed; all are restored before returning.

PROLOGUE(mpn_mod_1_1p_cps)
	mflr	r0
	std	r29, -24(r1)
	std	r30, -16(r1)
	std	r31, -8(r1)
	cntlzd	r31, r4			C cnt
	std	r0, 16(r1)		C save link register
	extsw	r31, r31
	mr	r29, r3			C keep output pointer across the call
	stdu	r1, -144(r1)
	sld	r30, r4, r31		C b = divisor << cnt
	mr	r3, r30
	CALL(	mpn_invert_limb)
	nop
	cmpdi	cr7, r31, 0
	neg	r0, r30			C B1modb = -b when cnt is zero
	beq-	cr7, L(13)
	subfic	r11, r31, 64
	li	r0, 1
	neg	r9, r30
	srd	r11, r3, r11		C inverse >> (64 - cnt)
	sld	r0, r0, r31		C 1 << cnt
	or	r0, r11, r0
	mulld	r0, r0, r9		C B1modb = -b * (the two values ORed)
C Derive B2modb from B1modb using the inverse in r3.
L(13):	mulhdu	r9, r0, r3
	mulld	r11, r0, r3
	add	r9, r0, r9
	nor	r9, r9, r9		C bitwise complement
	mulld	r9, r9, r30
	cmpld	cr7, r11, r9
	bge	cr7, L(14)
	add	r9, r9, r30		C low product < r9, so add b once
L(14):	addi	r1, r1, 144		C pop the frame
	srd	r0, r0, r31
	std	r31, 8(r29)		C cnt, stored as a full doubleword
	std	r3, 0(r29)		C inverse
	std	r0, 16(r29)		C B1modb >> cnt
	ld	r0, 16(r1)		C saved link register
	srd	r9, r9, r31
	ld	r30, -16(r1)
	ld	r31, -8(r1)
	std	r9, 24(r29)		C B2modb >> cnt
	ld	r29, -24(r1)
	mtlr	r0
	blr
EPILOGUE()