dnl  Alpha ev6 mpn_add_n -- Add two limb vectors of the same length > 0 and
dnl  store sum in a third limb vector.

dnl  Copyright 2000, 2003, 2005 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C      cycles/limb
C EV4:     ?
C EV5:     5.4
C EV6:     2.125

C  INPUT PARAMETERS
C  rp   r16
C  up   r17
C  vp   r18
C  n    r19
C  cy   r20   (for mpn_add_nc)

C  RETURN VALUE
C  r0   carry out of the last limb added (written at $Lend0 and $Lret)

C TODO
C   Finish cleaning up cy registers r22, r23 (make them use cy0/cy1)
C   Use multi-pronged feed-in.
C   Perform additional micro-tuning

C  This code was written in cooperation with ev6 pipeline expert Steve Root.

C  Pair loads and stores where possible
C  Store pairs oct-aligned where possible (didn't need it here)
C  Stores are delayed every third cycle
C  Loads and stores are delayed by fills
C  U stays still, put code there where possible (note alternation of U1 and U0)
C  L moves because of loads and stores
C  Note dampers in L to limit damage

C  This odd-looking optimization expects that we are having random bits in our
C  data, so that a pure zero result is unlikely.  So we penalize the unlikely
C  case to help the common case.

C  HOW THE CARRY CHAIN WORKS
C  Each result limb is formed in two steps: s = u + v, then s = s + carry-in.
C  The carry out of the first add is obtained with cmpult (s < v).  The second
C  add can only wrap when s was all ones and the carry-in was 1, and then the
C  result is exactly zero.  So instead of a second cmpult per limb, the unrolled
C  code tests the final limb for zero and only in that case branches to an
C  out-of-line $fix stub that ORs the carry-in into the carry-out.  When the
C  limb is zero with a zero carry-in the OR changes nothing, so the stub is
C  harmless in that case too.
C  This relies on every carry value being 0 or 1.  cmpult only produces those;
C  the cy argument of mpn_add_nc is presumably also 0 or 1 -- TODO confirm
C  against callers.

C  REGISTER ROLES
C  u0/u1, v0/v1      limbs loaded from up and vp, two pairs in flight
C  r5, r8, r7, r2    sums, in the order they are stored
C  cy0, cy1, r22, r23  carries travelling between consecutive limbs

define(`u0', `r0')  define(`u1', `r3')
define(`v0', `r1')  define(`v1', `r4')

define(`cy0', `r20')  define(`cy1', `r21')

MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc)

ASM_START()

C mpn_add_nc: same as mpn_add_n, except that the carry-in already sitting in
C r20 (= cy0) is used instead of being cleared.  It shares all code from
C $entry onwards.
PROLOGUE(mpn_add_nc)
        br      r31, $entry
EPILOGUE()

C mpn_add_n: clears the carry-in and falls into the shared body.
PROLOGUE(mpn_add_n)
        bis     r31, r31, cy0           C clear carry in
$entry: cmpult  r19, 5, r22             C L1 r22 = 1 when n < 5
        ldq     u1, 0(r17)              C L0 get next ones
        ldq     v1, 0(r18)              C L1
        bne     r22, $Lsmall            C n < 5: use the simple loop only

C Feed-in.  Loads limbs 0..4 and starts the carry chain, leaving the state
C that $Loop and $Lend expect on entry:
C   r5, r8    finished sums for limbs 0 and 1 (not yet stored)
C   r7        u + v for limb 2, still lacking its carry-in r22
C   r23       carry out of that limb 2 add
C   u0/v0     limb 3,  u1/v1  limb 4
        ldq     u0, 8(r17)              C L0 get next ones
        ldq     v0, 8(r18)              C L1
        addq    u1, v1, r5              C U0 add two data

        cmpult  r5, v1, r23             C U0 did it carry
        ldq     u1, 16(r17)             C L0 get next ones
        ldq     v1, 16(r18)             C L1

        addq    u0, v0, r8              C U1 add two data
        addq    r5, cy0, r5             C U0 carry in

        cmpult  r8, v0, r22             C U1 did it carry
        beq     r5, $fix5f              C U0 fix exact zero
$ret5f: ldq     u0, 24(r17)             C L0 get next ones
        ldq     v0, 24(r18)             C L1

        addq    r8, r23, r8             C U1 carry from last
        addq    u1, v1, r7              C U0 add two data

        beq     r8, $fix6f              C U1 fix exact zero
$ret6f: cmpult  r7, v1, r23             C U0 did it carry
        ldq     u1, 32(r17)             C L0 get next ones
        ldq     v1, 32(r18)             C L1

        lda     r17, 40(r17)            C L0 move pointer
        lda     r18, 40(r18)            C L1 move pointer

C rp is biased by -8 so that the first pending limb is stored at 8(r16).
C r19 becomes n - 13: it is non-negative only when, beyond the 5 limbs already
C loaded, there are at least the 8 more that one pass of $Loop loads.
        lda     r16, -8(r16)
        lda     r19, -13(r19)           C L1 move counter
        blt     r19, $Lend              C U1 loop control


C Main loop.  8-way unrolled.
C Each pass stores 8 result limbs (offsets 8..32 before r16 is advanced by 64,
C then -24..0 after) and loads the next 8 limb pairs (offsets 0..56).
        ALIGN(16)
$Loop:  addq    u0, v0, r2              C U1 add two data
        addq    r7, r22, r7             C U0 add in carry
        stq     r5, 8(r16)              C L0 put an answer
        stq     r8, 16(r16)             C L1 pair

        cmpult  r2, v0, cy1             C U1 did it carry
        beq     r7, $fix7               C U0 fix exact 0
$ret7:  ldq     u0, 0(r17)              C L0 get next ones
        ldq     v0, 0(r18)              C L1

        bis     r31, r31, r31           C L  damp out
        addq    r2, r23, r2             C U1 carry from last
        bis     r31, r31, r31           C L  moves in L !
        addq    u1, v1, r5              C U0 add two data

        beq     r2, $fix0               C U1 fix exact zero
$ret0:  cmpult  r5, v1, cy0             C U0 did it carry
        ldq     u1, 8(r17)              C L0 get next ones
        ldq     v1, 8(r18)              C L1

        addq    u0, v0, r8              C U1 add two data
        addq    r5, cy1, r5             C U0 carry from last
        stq     r7, 24(r16)             C L0 store pair
        stq     r2, 32(r16)             C L1

        cmpult  r8, v0, r22             C U1 did it carry
        beq     r5, $fix1               C U0 fix exact zero
$ret1:  ldq     u0, 16(r17)             C L0 get next ones
        ldq     v0, 16(r18)             C L1

        lda     r16, 64(r16)            C L0 move pointer
        addq    r8, cy0, r8             C U1 carry from last
        lda     r19, -8(r19)            C L1 move counter
        addq    u1, v1, r7              C U0 add two data

        beq     r8, $fix2               C U1 fix exact zero
$ret2:  cmpult  r7, v1, r23             C U0 did it carry
        ldq     u1, 24(r17)             C L0 get next ones
        ldq     v1, 24(r18)             C L1

        addq    u0, v0, r2              C U1 add two data
        addq    r7, r22, r7             C U0 add in carry
        stq     r5, -24(r16)            C L0 put an answer
        stq     r8, -16(r16)            C L1 pair

        cmpult  r2, v0, cy1             C U1 did it carry
        beq     r7, $fix3               C U0 fix exact 0
$ret3:  ldq     u0, 32(r17)             C L0 get next ones
        ldq     v0, 32(r18)             C L1

        bis     r31, r31, r31           C L  damp out
        addq    r2, r23, r2             C U1 carry from last
        bis     r31, r31, r31           C L  moves in L !
        addq    u1, v1, r5              C U0 add two data

        beq     r2, $fix4               C U1 fix exact zero
$ret4:  cmpult  r5, v1, cy0             C U0 did it carry
        ldq     u1, 40(r17)             C L0 get next ones
        ldq     v1, 40(r18)             C L1

        addq    u0, v0, r8              C U1 add two data
        addq    r5, cy1, r5             C U0 carry from last
        stq     r7, -8(r16)             C L0 store pair
        stq     r2, 0(r16)              C L1

        cmpult  r8, v0, r22             C U1 did it carry
        beq     r5, $fix5               C U0 fix exact zero
$ret5:  ldq     u0, 48(r17)             C L0 get next ones
        ldq     v0, 48(r18)             C L1

        ldl     r31, 256(r17)           C L0 prefetch
        addq    r8, cy0, r8             C U1 carry from last
        ldl     r31, 256(r18)           C L1 prefetch
        addq    u1, v1, r7              C U0 add two data

        beq     r8, $fix6               C U1 fix exact zero
$ret6:  cmpult  r7, v1, r23             C U0 did it carry
        ldq     u1, 56(r17)             C L0 get next ones
        ldq     v1, 56(r18)             C L1

        lda     r17, 64(r17)            C L0 move pointer
        bis     r31, r31, r31           C U
        lda     r18, 64(r18)            C L1 move pointer
        bge     r19, $Loop              C U1 loop control
C ==== main loop end

C Wind-down.  Completes and stores the five limbs still in flight (offsets
C 8..40) without loading any further limbs.  The carry out ends up in cy0.
$Lend:  addq    u0, v0, r2              C U1 add two data
        addq    r7, r22, r7             C U0 add in carry
        stq     r5, 8(r16)              C L0 put an answer
        stq     r8, 16(r16)             C L1 pair
        cmpult  r2, v0, cy1             C U1 did it carry
        beq     r7, $fix7c              C U0 fix exact 0
$ret7c: addq    r2, r23, r2             C U1 carry from last
        addq    u1, v1, r5              C U0 add two data
        beq     r2, $fix0c              C U1 fix exact zero
$ret0c: cmpult  r5, v1, cy0             C U0 did it carry
        addq    r5, cy1, r5             C U0 carry from last
        stq     r7, 24(r16)             C L0 store pair
        stq     r2, 32(r16)             C L1
        beq     r5, $fix1c              C U0 fix exact zero
$ret1c: stq     r5, 40(r16)             C L0 put an answer
        lda     r16, 48(r16)            C L0 move pointer, also undoes the -8 bias

C r19 is in -8..-1 here, so adding 8 gives the number of limbs not yet
C processed (0..7).  None left: return cy0.  Otherwise load the next limb pair
C and let the simple loop below finish the job.
        lda     r19, 8(r19)
        beq     r19, $Lret

        ldq     u1, 0(r17)
        ldq     v1, 0(r18)

C Simple one-limb-per-pass code.  On entry r19 = limbs left to do, u1/v1 = the
C next limb pair (already loaded) and cy0 = carry in.  $Loop0 handles every
C limb but the last and loads the following pair each pass; $Lend0 handles the
C last limb without loading anything further.
$Lsmall:
        lda     r19, -1(r19)
        beq     r19, $Lend0             C exactly one limb left

        ALIGN(8)
$Loop0: addq    u1, v1, r2              C main add
        cmpult  r2, v1, r8              C compute cy from last add
        ldq     u1, 8(r17)
        ldq     v1, 8(r18)
        addq    r2, cy0, r5             C carry add
        lda     r17, 8(r17)
        lda     r18, 8(r18)
        stq     r5, 0(r16)
        cmpult  r5, r2, cy0             C compute cy from last add
        lda     r19, -1(r19)            C decr loop cnt
        bis     r8, cy0, cy0            C combine cy from the two adds
        lda     r16, 8(r16)
        bne     r19, $Loop0
$Lend0: addq    u1, v1, r2              C main add
        addq    r2, cy0, r5             C carry add
        cmpult  r2, v1, r8              C compute cy from last add
        cmpult  r5, r2, cy0             C compute cy from last add
        stq     r5, 0(r16)
        bis     r8, cy0, r0             C combine cy from the two adds
        ret     r31,(r26),1

        ALIGN(8)
$Lret:  lda     r0, 0(cy0)              C copy carry into return register
        ret     r31,(r26),1

C Out-of-line fixups.  Each one is reached only when a limb came out exactly
C zero after its carry-in was added.  It ORs that carry-in into the carry-out
C register of the same limb and resumes at the matching $ret label.
C Suffix f = feed-in code, suffix c = wind-down code, no suffix = main loop.
$fix5f: bis     r23, cy0, r23           C bring forward carry
        br      r31, $ret5f
$fix6f: bis     r22, r23, r22           C bring forward carry
        br      r31, $ret6f
$fix0:  bis     cy1, r23, cy1           C bring forward carry
        br      r31, $ret0
$fix1:  bis     cy0, cy1, cy0           C bring forward carry
        br      r31, $ret1
$fix2:  bis     r22, cy0, r22           C bring forward carry
        br      r31, $ret2
$fix3:  bis     r23, r22, r23           C bring forward carry
        br      r31, $ret3
$fix4:  bis     cy1, r23, cy1           C bring forward carry
        br      r31, $ret4
$fix5:  bis     cy1, cy0, cy0           C bring forward carry
        br      r31, $ret5
$fix6:  bis     r22, cy0, r22           C bring forward carry
        br      r31, $ret6
$fix7:  bis     r23, r22, r23           C bring forward carry
        br      r31, $ret7
$fix0c: bis     cy1, r23, cy1           C bring forward carry
        br      r31, $ret0c
$fix1c: bis     cy0, cy1, cy0           C bring forward carry
        br      r31, $ret1c
$fix7c: bis     r23, r22, r23           C bring forward carry
        br      r31, $ret7c

EPILOGUE()
ASM_END()