dnl  PowerPC-64 mpn_add_n/mpn_sub_n -- mpn addition and subtraction.

dnl  Copyright 1999, 2000, 2001, 2003, 2004, 2005, 2007 Free Software
dnl  Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C                  cycles/limb
C POWER3/PPC630:       1.5
C POWER4/PPC970:       2

C       n   POWER3/PPC630   POWER4/PPC970
C       1       17.00           19.00
C       2        9.00           10.49
C       3        5.33            7.66
C       4        4.50            5.14
C       5        4.20            4.80
C       6        3.83            4.33
C       7        3.00            3.99
C       8        2.87            3.55
C       9        2.89            3.40
C      10        2.60            3.42
C      11        2.45            3.15
C      12        2.41            2.99
C      13        2.46            3.01
C      14        2.42            2.97
C      15        2.20            2.85
C      50        1.78            2.44
C     100        1.83            2.20
C     200        1.55            2.12
C     400        1.53            2.05
C    1000        1.98            2.02#
C    2000        1.50#           2.04
C    4000        2.55            2.50
C    8000        2.70            2.45
C   16000        2.65            5.94
C   32000        2.62           16.41
C   64000        2.73           18.94

C This code is a little bit slower for POWER3/PPC630 than the simple code used
C previously, but it is much faster for POWER4/PPC970.  The reason for the
C POWER3/PPC630 slowdown can be attributed to the saving and restoring of 4
C registers.
C INPUT PARAMETERS
C rp	r3	destination limb vector (written with std)
C up	r4	first source limb vector, s1
C vp	r5	second source limb vector, s2
C n	r6	number of limbs
C cy	r7	carry-in / borrow-in, read only by the nc entry point
C
C Each result limb is s1 + s2 (add) or s1 - s2 (sub), chained through the
C CA bit.  The return value in r3 is the final carry (add) or borrow (sub).
C
C NOTE(review): n = 0 is not handled here.  The counter would be loaded with 0
C and the first bdz would wrap it instead of terminating, so callers are
C presumably required to pass n >= 1.

C Operation-specific pieces, selected by OPERATION_add_n or OPERATION_sub_n:
C   ADDSUBC  limb operation that consumes and produces CA
C   ADDSUB   form without carry-in (not used in the code visible here)
C   GENRVAL  converts r3 = CA - 1 into the return value
C   SETCBR   sets CA from the carry-in register given as argument 1
C   CLRCB    puts CA into the no-carry (add) or no-borrow (sub) state
C For subtraction CA = 1 means no borrow.  CLRCB therefore adds -1 to r1,
C which sets CA as long as r1 (the stack pointer) is non-zero, and SETCBR
C uses subfic so that CA = 1 exactly when the borrow-in is zero.

ifdef(`OPERATION_add_n',`
  define(ADDSUBC,	adde)
  define(ADDSUB,	addc)
  define(func,		mpn_add_n)
  define(func_nc,	mpn_add_nc)
  define(GENRVAL,	`addi	r3, r3, 1')
  define(SETCBR,	`addic	r0, $1, -1')
  define(CLRCB,		`addic	r0, r0, 0')
')
ifdef(`OPERATION_sub_n',`
  define(ADDSUBC,	subfe)
  define(ADDSUB,	subfc)
  define(func,		mpn_sub_n)
  define(func_nc,	mpn_sub_nc)
  define(GENRVAL,	`neg	r3, r3')
  define(SETCBR,	`subfic	r0, $1, 0')
  define(CLRCB,		`addic	r0, r1, -1')
')

MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)

ASM_START()

C Entry point with explicit carry-in: set CA from r7, then share the body of
C the plain entry point.
PROLOGUE(func_nc)
	SETCBR(r7)
	b	L(ent)
EPILOGUE()

C Plain entry point: start with CA in the neutral state.
PROLOGUE(func)
	CLRCB
C r28-r31 are used as result registers below.  They are saved at negative
C offsets from r1 without moving r1.
C NOTE(review): this relies on the area just below the stack pointer being
C usable by a leaf routine under the target ABI - confirm for new targets.
L(ent):	std	r31, -8(r1)
	std	r30, -16(r1)
	std	r29, -24(r1)
	std	r28, -32(r1)

C None of the instructions between here and the first ADDSUBC write CA, so the
C state established by CLRCB or SETCBR is still intact when the limb
C operations start.
	rldicl.	r0, r6, 0,62	C r0 = n & 3, set cr0
	cmpdi	cr6, r0, 2	C cr6 = (n & 3) compared with 2
	addi	r6, r6, 3	C compute count...
	srdi	r6, r6, 2	C ...for ctr: (n + 3) / 4 groups
	mtctr	r6		C copy count into ctr
	beq	cr0, L(b00)	C n & 3 == 0
	blt	cr6, L(b01)	C n & 3 == 1
	beq	cr6, L(b10)	C n & 3 == 2

C n & 3 == 3: handle three leading limbs, then continue with groups of four.
L(b11):	ld	r8, 0(r4)	C load s1 limb
	ld	r9, 0(r5)	C load s2 limb
	ld	r10, 8(r4)	C load s1 limb
	ld	r11, 8(r5)	C load s2 limb
	ld	r12, 16(r4)	C load s1 limb
	addi	r4, r4, 24
	ld	r0, 16(r5)	C load s2 limb
	addi	r5, r5, 24
	ADDSUBC	r29, r9, r8
	ADDSUBC	r30, r11, r10
	ADDSUBC	r31, r0, r12
	std	r29, 0(r3)
	std	r30, 8(r3)
	std	r31, 16(r3)
	addi	r3, r3, 24
	bdnz	L(go)		C more groups of four remain
	b	L(ret)

C n & 3 == 1: handle one leading limb.
L(b01):	ld	r12, 0(r4)	C load s1 limb
	addi	r4, r4, 8
	ld	r0, 0(r5)	C load s2 limb
	addi	r5, r5, 8
	ADDSUBC	r31, r0, r12	C add/sub with carry
	std	r31, 0(r3)
	addi	r3, r3, 8
	bdnz	L(go)		C more groups of four remain
	b	L(ret)

C n & 3 == 2: handle two leading limbs.
L(b10):	ld	r10, 0(r4)	C load s1 limb
	ld	r11, 0(r5)	C load s2 limb
	ld	r12, 8(r4)	C load s1 limb
	addi	r4, r4, 16
	ld	r0, 8(r5)	C load s2 limb
	addi	r5, r5, 16
	ADDSUBC	r30, r11, r10	C add/sub with carry
	ADDSUBC	r31, r0, r12	C add/sub with carry
	std	r30, 0(r3)
	std	r31, 8(r3)
	addi	r3, r3, 16
	bdnz	L(go)		C more groups of four remain
	b	L(ret)

C n & 3 == 0: nothing to peel off, fall straight into the 4-limb code.
L(b00):
C Preload the first group of four limb pairs.
L(go):	ld	r6, 0(r4)	C load s1 limb
	ld	r7, 0(r5)	C load s2 limb
	ld	r8, 8(r4)	C load s1 limb
	ld	r9, 8(r5)	C load s2 limb
	ld	r10, 16(r4)	C load s1 limb
	ld	r11, 16(r5)	C load s2 limb
	ld	r12, 24(r4)	C load s1 limb
	ld	r0, 24(r5)	C load s2 limb
	bdz	L(end)		C that was the last group

	addi	r4, r4, 32
	addi	r5, r5, 32

C Main loop, four limbs per iteration.  Each ADDSUBC consumes a limb pair
C loaded earlier, and the same registers are then reloaded for the next
C iteration.
L(oop):	ADDSUBC	r28, r7, r6
	ld	r6, 0(r4)	C load s1 limb
	ld	r7, 0(r5)	C load s2 limb
	ADDSUBC	r29, r9, r8
	ld	r8, 8(r4)	C load s1 limb
	ld	r9, 8(r5)	C load s2 limb
	ADDSUBC	r30, r11, r10
	ld	r10, 16(r4)	C load s1 limb
	ld	r11, 16(r5)	C load s2 limb
	ADDSUBC	r31, r0, r12
	ld	r12, 24(r4)	C load s1 limb
	ld	r0, 24(r5)	C load s2 limb
	std	r28, 0(r3)
	addi	r4, r4, 32
	std	r29, 8(r3)
	addi	r5, r5, 32
	std	r30, 16(r3)
	std	r31, 24(r3)
	addi	r3, r3, 32
	bdnz	L(oop)		C decrement ctr and loop back

C Final group: the limbs are already in registers, no further loads.
L(end):	ADDSUBC	r28, r7, r6
	ADDSUBC	r29, r9, r8
	ADDSUBC	r30, r11, r10
	ADDSUBC	r31, r0, r12
	std	r28, 0(r3)
	std	r29, 8(r3)
	std	r30, 16(r3)
	std	r31, 24(r3)

C Restore the saved registers (ld does not touch CA) and build the result.
L(ret):	ld	r31, -8(r1)
	ld	r30, -16(r1)
	ld	r29, -24(r1)
	ld	r28, -32(r1)

	subfe	r3, r0, r0	C r3 = CA - 1, i.e. 0 or -1
	GENRVAL			C add: r3 + 1 = carry; sub: -r3 = borrow
	blr
EPILOGUE()