dnl  PowerPC-64 mpn_mod_1s_4p

dnl  Copyright 2010, 2011 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C                   cycles/limb
C POWER3/PPC630          ?
C POWER4/PPC970          9
C POWER5                 9
C POWER6                13
C POWER7                 3.5

C TODO
C  * Optimise, in particular the cps function.  This was compiler-generated and
C    then hand optimised.

C INPUT PARAMETERS
define(`ap',  `r3')
define(`n',   `r4')
define(`d',   `r5')
define(`cps', `r6')

ASM_START()

EXTERN_FUNC(mpn_invert_limb)

C ---------------------------------------------------------------------------
C mpn_mod_1s_4p
C
C In:	ap  (r3)  pointer to the low limb of the operand
C	n   (r4)  limb count
C	d   (r5)  divisor used by the final reduction
C		  NOTE(review): presumably already shifted left by the count
C		  stored in the table, since the result is shifted right by
C		  that count before returning -- confirm against the caller
C	cps (r6)  table of seven doublewords as written by
C		  mpn_mod_1s_4p_cps below
C Out:	r3	  remainder
C
C Table entries as used here (byte offset: role):
C	 0: reciprocal returned by mpn_invert_limb
C	 8: shift count cnt
C	16, 24, 32: multipliers for the 2nd, 3rd and 4th limb of a group
C		    (called B1modb..B3modb below, by analogy with the
C		    B4modb/B5modb names used in the loop)
C	40: B4modb, multiplier for the low limb of the running residue
C	48: B5modb, multiplier for the high limb of the running residue
C
C The running residue is the two-limb value r9:r0 (rh:rl).
C r23-r31 are saved below the stack pointer without allocating a frame;
C this function makes no calls.
C NOTE(review): assumes n >= 1 -- confirm against the caller.
C ---------------------------------------------------------------------------
PROLOGUE(mpn_mod_1s_4p)
	std	r23, -72(r1)
	ld	r23, 48(cps)		C B5modb
	std	r24, -64(r1)
	std	r25, -56(r1)
	ld	r24, 32(cps)		C B3modb
	ld	r25, 24(cps)		C B2modb
	std	r26, -48(r1)
	std	r27, -40(r1)
	ld	r26, 16(cps)		C B1modb
	std	r28, -32(r1)
	std	r29, -24(r1)
	std	r30, -16(r1)
	std	r31, -8(r1)
	ld	r30, 40(cps)		C B4modb

	rldicl.	r0, n, 0,62		C r0 = n mod 4, also sets cr0
	sldi	r31, n, 3
	add	ap, ap, r31		C make ap point at end of operand

	cmpdi	cr7, r0, 2
	beq	cr0, L(b00)		C n mod 4 = 0
	blt	cr7, L(b01)		C n mod 4 = 1
	beq	cr7, L(b10)		C n mod 4 = 2

C n mod 4 = 3: fold the three most significant limbs into r9:r0
L(b11):	ld	r11, -16(ap)
	ld	r9, -8(ap)
	ld	r0, -24(ap)
	mulhdu	r27, r11, r26
	mulld	r8, r11, r26
	mulhdu	r11, r9, r25
	mulld	r9, r9, r25
	addc	r31, r8, r0
	addze	r10, r27
	addc	r0, r9, r31
	adde	r9, r11, r10
	addi	ap, ap, -40
	b	L(6)

C n mod 4 = 0: fold the four most significant limbs into r9:r0
	ALIGN(16)
L(b00):	ld	r11, -24(ap)
	ld	r10, -16(ap)
	ld	r9, -8(ap)
	ld	r0, -32(ap)
	mulld	r8, r11, r26
	mulhdu	r7, r10, r25
	mulhdu	r27, r11, r26
	mulhdu	r11, r9, r24
	mulld	r10, r10, r25
	mulld	r9, r9, r24
	addc	r31, r8, r0
	addze	r0, r27
	addc	r8, r31, r10
	adde	r10, r0, r7
	addc	r0, r9, r8
	adde	r9, r11, r10
	addi	ap, ap, -48
	b	L(6)

C n mod 4 = 1: the residue is just the most significant limb
	ALIGN(16)
L(b01):	li	r9, 0
	ld	r0, -8(ap)
	addi	ap, ap, -24
	b	L(6)

C n mod 4 = 2: the residue is the two most significant limbs
	ALIGN(16)
L(b10):	ld	r9, -8(ap)
	ld	r0, -16(ap)
	addi	ap, ap, -32

C ctr = (n + 3) / 4.  The bdz decrements it first, so the loop below runs
C one time fewer than that: the lead-in above already consumed one group.
	ALIGN(16)
L(6):	addi	r10, n, 3
	srdi	r7, r10, 2
	mtctr	r7
	bdz	L(end)

C Each pass consumes the next four limbs below the ones already folded
C (at ap-16, ap-8, ap+0, ap+8) and computes
C   r9:r0 = ap[-16] + ap[-8]*B1modb + ap[0]*B2modb + ap[8]*B3modb
C	    + rl*B4modb + rh*B5modb
C r4 is reused as scratch here; n is no longer needed.
	ALIGN(16)
L(top):	ld	r31, -16(ap)
	ld	r10, -8(ap)
	ld	r11, 8(ap)
	ld	r12, 0(ap)
	mulld	r29, r0, r30		C rl * B4modb
	mulhdu	r0, r0, r30		C rl * B4modb
	mulhdu	r27, r10, r26
	mulld	r10, r10, r26
	mulhdu	r7, r9, r23		C rh * B5modb
	mulld	r9, r9, r23		C rh * B5modb
	mulhdu	r28, r11, r24
	mulld	r11, r11, r24
	mulhdu	r4, r12, r25
	mulld	r12, r12, r25
	addc	r8, r10, r31
	addze	r10, r27
	addi	ap, ap, -32
	addc	r27, r8, r12
	adde	r12, r10, r4
	addc	r11, r27, r11
	adde	r31, r12, r28
	addc	r12, r11, r29
	adde	r4, r31, r0
	addc	r0, r9, r12
	adde	r9, r7, r4
	bdnz	L(top)

C Final reduction of rh:rl.
C The shift count is stored as a full doubleword at offset 8 by
C mpn_mod_1s_4p_cps, so load the whole doubleword.  A 32-bit load from
C offset 12 yields the count only on big-endian; on little-endian it reads
C the all-zero high half and gives wrong remainders for divisors that are
C not already normalised.
L(end):	ld	r3, 8(cps)		C cnt, endian-neutral load
	mulld	r10, r9, r26
	mulhdu	r9, r9, r26
	addc	r11, r0, r10
	addze	r9, r9			C r9:r11 = rl + rh * B1modb
	ld	r10, 0(cps)		C reciprocal
	subfic	r8, r3, 64
	sld	r9, r9, r3
	srd	r8, r11, r8
	sld	r11, r11, r3
	or	r9, r8, r9		C r9:r11 = two-limb value shifted left by cnt
	mulld	r0, r9, r10
	mulhdu	r10, r9, r10
	addi	r9, r9, 1
	addc	r8, r0, r11
	adde	r0, r10, r9		C quotient estimate
	mulld	r0, r0, d
	subf	r0, r0, r11		C candidate remainder
	cmpld	cr7, r8, r0
	bge	cr7, L(9)
	add	r0, r0, d		C first correction
L(9):	cmpld	cr7, r0, d
	bge-	cr7, L(16)		C second correction, hinted as unlikely
L(10):	srd	r3, r0, r3		C undo the shift by cnt
	ld	r23, -72(r1)
	ld	r24, -64(r1)
	ld	r25, -56(r1)
	ld	r26, -48(r1)
	ld	r27, -40(r1)
	ld	r28, -32(r1)
	ld	r29, -24(r1)
	ld	r30, -16(r1)
	ld	r31, -8(r1)
	blr

L(16):	subf	r0, d, r0
	b	L(10)
EPILOGUE()

C ---------------------------------------------------------------------------
C mpn_mod_1s_4p_cps
C
C In:	r3  pointer to a table of seven doublewords to fill in
C	r4  divisor b
C
C Writes (byte offset: value):
C	 0: value returned by mpn_invert_limb for b shifted left by cnt
C	 8: cnt, the number of leading zero bits of b
C	16..48: five successive values, each shifted right by cnt, read by
C		mpn_mod_1s_4p as B1modb..B5modb
C
C Allocates a 144-byte frame around the call to mpn_invert_limb; r29-r31
C and the link register are saved before the frame is pushed and reloaded
C after it is popped.
C ---------------------------------------------------------------------------
PROLOGUE(mpn_mod_1s_4p_cps)
	mflr	r0
	std	r29, -24(r1)
	std	r30, -16(r1)
	mr	r29, r3			C keep the table pointer across the call
	std	r0, 16(r1)
	std	r31, -8(r1)
	stdu	r1, -144(r1)
	cntlzd	r31, r4			C cnt
	sld	r30, r4, r31		C b shifted left by cnt
	mr	r3, r30
	CALL(	mpn_invert_limb)
	nop				C NOTE(review): presumably the slot the linker may patch after the call -- confirm
	subfic	r9, r31, 64
	li	r10, 1
	sld	r10, r10, r31
	srd	r9, r3, r9
	neg	r0, r30
	or	r10, r10, r9
	mulld	r10, r10, r0		C first table value (before the final shift)

C Each of the next four steps derives the following table value from the
C previous one using the reciprocal in r3, then applies one conditional
C correction by adding the shifted divisor.
	mulhdu	r11, r10, r3
	nor	r11, r11, r11
	subf	r11, r10, r11
	mulld	r11, r11, r30
	mulld	r0, r10, r3
	cmpld	cr7, r0, r11
	bge	cr7, L(18)
	add	r11, r11, r30
L(18):	mulhdu	r9, r11, r3
	add	r9, r11, r9
	nor	r9, r9, r9
	mulld	r9, r9, r30
	mulld	r0, r11, r3
	cmpld	cr7, r0, r9
	bge	cr7, L(19)
	add	r9, r9, r30
L(19):	mulhdu	r0, r9, r3
	add	r0, r9, r0
	nor	r0, r0, r0
	mulld	r0, r0, r30
	mulld	r8, r9, r3
	cmpld	cr7, r8, r0
	bge	cr7, L(20)
	add	r0, r0, r30
L(20):	mulhdu	r8, r0, r3
	add	r8, r0, r8
	nor	r8, r8, r8
	mulld	r8, r8, r30
	mulld	r7, r0, r3
	cmpld	cr7, r7, r8
	bge	cr7, L(21)
	add	r8, r8, r30

C Shift every value right by cnt, store the table, pop the frame and
C restore the saved registers.
L(21):	srd	r0, r0, r31
	addi	r1, r1, 144		C pop the frame
	srd	r8, r8, r31
	srd	r10, r10, r31
	srd	r11, r11, r31
	std	r0, 40(r29)
	std	r31, 8(r29)		C cnt, stored as a full doubleword
	srd	r9, r9, r31
	ld	r0, 16(r1)		C saved link register
	ld	r30, -16(r1)
	std	r8, 48(r29)
	std	r3, 0(r29)		C reciprocal
	mtlr	r0
	ld	r31, -8(r1)
	std	r10, 16(r29)
	std	r11, 24(r29)
	std	r9, 32(r29)
	ld	r29, -24(r1)
	blr
EPILOGUE()