dnl  Alpha mpn_mod_1s_4p

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  Copyright 2009, 2010 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C TODO:
C  * Optimise.  2.75 c/l should be possible.
C  * Write a proper mpn_mod_1s_4p_cps.  The code below was compiler generated.
C  * Optimise feed-in code, starting the sw pipeline in switch code.
C  * Shorten software pipeline.  The mul instructions are scheduled too far
C    from their users.  Fixing this will allow us to use fewer registers.
C  * If we cannot reduce register usage, write perhaps small-n basecase.
C  * Does this work for PIC?

C      cycles/limb
C EV4:     ?
C EV5:    23
C EV6:     3

C Symbolic register names used by mpn_mod_1s_4p.
C   ap, n           incoming limb pointer and limb count
C   pl, ph          low/high word of the partial sum being accumulated
C   rl, rh          low/high word of the running two-word residue
C   B1modb..B5modb  the five constants loaded from the table at r19
define(`ap',     `r16')
define(`n',      `r17')
define(`pl',     `r24')
define(`ph',     `r25')
define(`rl',     `r6')
define(`rh',     `r7')
define(`B1modb', `r1')
define(`B2modb', `r2')
define(`B3modb', `r3')
define(`B4modb', `r4')
define(`B5modb', `r5')

ASM_START()

C mpn_mod_1s_4p
C
C Register roles on entry, as read by the code below:
C   r16 (ap)  pointer to the first limb of the operand
C   r17 (n)   number of limbs
C   r18       divisor b.  It is compared directly against the shifted
C             remainder, so it is presumably already shifted left by cnt
C             -- confirm against the caller.
C   r19       pointer to the table filled in by mpn_mod_1s_4p_cps:
C               0: bi    8: cnt    16,24,32,40,48: B1modb..B5modb
C Result is returned in r0.
C
C r9-r13 are spilled to a 64-byte stack frame and restored before return.
C
C Outline:
C   1. Feed-in: fold the top 1..4 limbs, selected by n mod 4, into rh:rl.
C   2. Main loop: each pass folds four more limbs plus the old rh:rl,
C      weighted by B1modb..B5modb, into a new rh:rl.
C   3. Tail: fold rh once more with B1modb, shift left by cnt, reduce the
C      two-word value with bi and b, then shift the remainder right by cnt.
PROLOGUE(mpn_mod_1s_4p)
	lda	r30, -64(r30)
	stq	r9, 8(r30)
	ldq	B1modb, 16(r19)
	stq	r10, 16(r30)
	ldq	B2modb, 24(r19)
	stq	r11, 24(r30)
	ldq	B3modb, 32(r19)
	stq	r12, 32(r30)
	ldq	B4modb, 40(r19)
	stq	r13, 40(r30)
	ldq	B5modb, 48(r19)
	s8addq	n, ap, ap		C point ap at vector end

C Dispatch on n mod 4.  n is reduced by 4 up front, so the test at L(com)
C sees the count still pending after the feed-in, minus 4 at most.
	and	n, 3, r0
	lda	n, -4(n)
	beq	r0, L(b0)
	lda	r6, -2(r0)
	blt	r6, L(b1)
	beq	r6, L(b2)

C n mod 4 = 3: rh:rl = a[n-3] + a[n-2]*B1modb + a[n-1]*B2modb
L(b3):	ldq	r21, -16(ap)
	ldq	r22, -8(ap)
	ldq	r20, -24(ap)
	mulq	r21, B1modb, r8
	umulh	r21, B1modb, r12
	mulq	r22, B2modb, r9
	umulh	r22, B2modb, r13
	addq	r8, r20, pl
	cmpult	pl, r8, r0		C carry out of low word
	addq	r0, r12, ph
	addq	r9, pl, rl
	cmpult	rl, r9, r0
	addq	r13, ph, ph
	addq	r0, ph, rh
	lda	ap, -56(ap)
	br	L(com)

C n mod 4 = 0: rh:rl = a[n-4] + a[n-3]*B1modb + a[n-2]*B2modb + a[n-1]*B3modb
L(b0):	ldq	r21, -24(ap)
	ldq	r22, -16(ap)
	ldq	r23, -8(ap)
	ldq	r20, -32(ap)
	mulq	r21, B1modb, r8
	umulh	r21, B1modb, r12
	mulq	r22, B2modb, r9
	umulh	r22, B2modb, r13
	mulq	r23, B3modb, r10
	umulh	r23, B3modb, r27
	addq	r8, r20, pl
	cmpult	pl, r8, r0
	addq	r0, r12, ph
	addq	r9, pl, pl
	cmpult	pl, r9, r0
	addq	r13, ph, ph
	addq	r0, ph, ph
	addq	r10, pl, rl
	cmpult	rl, r10, r0
	addq	r27, ph, ph
	addq	r0, ph, rh
	lda	ap, -64(ap)
	br	L(com)

C n mod 4 = 1: rh:rl = 0:a[n-1]
L(b1):	bis	r31, r31, rh
	ldq	rl, -8(ap)
	lda	ap, -40(ap)
	br	L(com)

C n mod 4 = 2: rh:rl = a[n-1]:a[n-2]
L(b2):	ldq	rh, -8(ap)
	ldq	rl, -16(ap)
	lda	ap, -48(ap)

C Pipeline start-up.  Skip straight to the tail when nothing is pending.
C Otherwise load the next four limbs and start their products, together
C with rl*B4modb, so the loop body can consume them one pass later.
L(com):	ble	n, L(ed3)
	ldq	r21, 8(ap)
	ldq	r22, 16(ap)
	ldq	r23, 24(ap)
	ldq	r20, 0(ap)
	lda	n, -4(n)
	lda	ap, -32(ap)
	mulq	r21, B1modb, r8
	umulh	r21, B1modb, r12
	mulq	r22, B2modb, r9
	umulh	r22, B2modb, r13
	mulq	r23, B3modb, r10
	umulh	r23, B3modb, r27
	mulq	rl, B4modb, r11
	umulh	rl, B4modb, r28
	ble	n, L(ed2)

C Main loop, four limbs per pass.  The adds sum the products issued on the
C previous pass while the loads and multiplies for the next pass are
C started.  Statement order is part of the schedule; do not reorder.
	ALIGN(16)
L(top):	ldq	r21, 8(ap)
	mulq	rh, B5modb, rl
	addq	r8, r20, pl
	ldq	r22, 16(ap)
	cmpult	pl, r8, r0
	umulh	rh, B5modb, rh
	ldq	r23, 24(ap)
	addq	r0, r12, ph
	addq	r9, pl, pl
	mulq	r21, B1modb, r8
	cmpult	pl, r9, r0
	addq	r13, ph, ph
	umulh	r21, B1modb, r12
	lda	ap, -32(ap)
	addq	r0, ph, ph
	addq	r10, pl, pl
	mulq	r22, B2modb, r9
	cmpult	pl, r10, r0
	addq	r27, ph, ph
	addq	r11, pl, pl
	umulh	r22, B2modb, r13
	addq	r0, ph, ph
	cmpult	pl, r11, r0
	addq	r28, ph, ph
	mulq	r23, B3modb, r10
	ldq	r20, 32(ap)		C ap was already stepped by -32 above
	addq	pl, rl, rl
	umulh	r23, B3modb, r27
	addq	r0, ph, ph
	cmpult	rl, pl, r0
	mulq	rl, B4modb, r11
	addq	ph, rh, rh
	umulh	rl, B4modb, r28
	addq	r0, rh, rh
	lda	n, -4(n)
	bgt	n, L(top)

C Pipeline wind-down: sum the products still in flight, no new loads.
L(ed2):	mulq	rh, B5modb, rl
	addq	r8, r20, pl
	umulh	rh, B5modb, rh
	cmpult	pl, r8, r0
	addq	r0, r12, ph
	addq	r9, pl, pl
	cmpult	pl, r9, r0
	addq	r13, ph, ph
	addq	r0, ph, ph
	addq	r10, pl, pl
	cmpult	pl, r10, r0
	addq	r27, ph, ph
	addq	r11, pl, pl
	addq	r0, ph, ph
	cmpult	pl, r11, r0
	addq	r28, ph, ph
	addq	pl, rl, rl
	addq	r0, ph, ph
	cmpult	rl, pl, r0
	addq	ph, rh, rh
	addq	r0, rh, rh

C Fold the high word once more: rh:rl = rl + rh*B1modb
L(ed3):	mulq	rh, B1modb, r8
	umulh	rh, B1modb, rh
	addq	r8, rl, rl
	cmpult	rl, r8, r0
	addq	r0, rh, rh

C Shift rh:rl left by cnt.  The right shift count is -cnt; the shift
C instructions use only the low six bits of the count.
	ldq	r24, 8(r19)		C cnt
	sll	rh, r24, rh
	subq	r31, r24, r25
	srl	rl, r25, r2
	sll	rl, r24, rl
	or	r2, rh, rh

C Reduce rh:rl by b using the precomputed inverse bi: form a quotient
C estimate qh, take rl - qh*b, then apply two masked corrections.
	ldq	r23, 0(r19)		C bi
	mulq	rh, r23, r8
	umulh	rh, r23, r9
	addq	rh, 1, r7
	addq	r8, rl, r8		C ql
	cmpult	r8, rl, r0
	addq	r9, r7, r9
	addq	r0, r9, r9		C qh
	mulq	r9, r18, r21		C qh * b
	subq	rl, r21, rl
	cmpult	r8, rl, r0		C rl > ql
	negq	r0, r0			C 0 or all ones
	and	r0, r18, r0
	addq	rl, r0, rl		C add b back when rl > ql
	cmpule	r18, rl, r0		C rl >= b
	negq	r0, r0
	and	r0, r18, r0
	subq	rl, r0, rl		C subtract b when rl >= b

	srl	rl, r24, r0		C result = remainder >> cnt

	ldq	r9, 8(r30)
	ldq	r10, 16(r30)
	ldq	r11, 24(r30)
	ldq	r12, 32(r30)
	ldq	r13, 40(r30)
	lda	r30, 64(r30)
	ret	r31, (r26), 1
EPILOGUE()

C mpn_mod_1s_4p_cps
C
C Fills in the seven-word table that mpn_mod_1s_4p reads through r19.
C   r16  pointer to the table, kept in r11 across the call
C   r17  divisor b
C Table layout as written below:
C    0      bi, the value returned by mpn_invert_limb for b << cnt
C    8      cnt
C   16..48  five successively derived values, each stored shifted right
C           by cnt; mpn_mod_1s_4p loads them as B1modb..B5modb
C cnt comes from two __clz_tab lookups and is presumably the number of
C leading zero bits of b -- confirm against the __clz_tab definition.
C
C r26, r9, r10 and r11 are saved in a 32-byte frame.  The body is compiler
C generated, see the TODO list at the top of the file.
PROLOGUE(mpn_mod_1s_4p_cps,gp)
	lda	r30, -32(r30)
	stq	r26, 0(r30)
	stq	r9, 8(r30)
	stq	r10, 16(r30)
	stq	r11, 24(r30)
	mov	r16, r11		C keep table pointer across the call
	LEA(	r4, __clz_tab)
C Compute cnt in r10: start from 65, then subtract a byte-granular shift
C amount and a table-derived bit count for the remaining top bits.
	lda	r10, 65(r31)
	cmpbge	r31, r17, r1		C one mask bit per zero byte of b
	srl	r1, 1, r1
	xor	r1, 127, r1
	addq	r1, r4, r1
	ldq_u	r2, 0(r1)
	extbl	r2, r1, r2		C byte fetched from __clz_tab
	s8subq	r2, 7, r2
	srl	r17, r2, r3
	subq	r10, r2, r10
	addq	r3, r4, r3
	ldq_u	r1, 0(r3)
	extbl	r1, r3, r1		C second __clz_tab byte
	subq	r10, r1, r10		C r10 = cnt
	sll	r17, r10, r9		C r9 = b << cnt
	mov	r9, r16
	jsr	r26, mpn_invert_limb
C r0 = bi.  The ldah/lda pair on r29 below carries zero displacements;
C see the PIC question in the TODO list.
	ldah	r29, 0(r26)
	subq	r31, r10, r2
	lda	r1, 1(r31)
	sll	r1, r10, r1
	subq	r31, r9, r3		C r3 = -(b << cnt)
	srl	r0, r2, r2
	ldq	r26, 0(r30)
	bis	r2, r1, r2		C (bi >> -cnt) | (1 << cnt)
	lda	r29, 0(r29)
	stq	r0, 0(r11)		C table[0] = bi
	stq	r10, 8(r11)		C table[1] = cnt
	mulq	r2, r3, r2		C first derived value
	srl	r2, r10, r3
	umulh	r2, r0, r1
	stq	r3, 16(r11)		C table[2]
C Each of the next four groups derives a new value from the previous one
C using bi and b << cnt, with a conditional add-back of b << cnt, and
C stores it shifted right by cnt.
	mulq	r2, r0, r3
	ornot	r31, r1, r1
	subq	r1, r2, r1
	mulq	r1, r9, r1
	addq	r1, r9, r2
	cmpule	r1, r3, r3
	cmoveq	r3, r2, r1
	srl	r1, r10, r3
	umulh	r1, r0, r2
	stq	r3, 24(r11)		C table[3]
	mulq	r1, r0, r3
	ornot	r31, r2, r2
	subq	r2, r1, r2
	mulq	r2, r9, r2
	addq	r2, r9, r1
	cmpule	r2, r3, r3
	cmoveq	r3, r1, r2
	srl	r2, r10, r1
	umulh	r2, r0, r3
	stq	r1, 32(r11)		C table[4]
	mulq	r2, r0, r1
	ornot	r31, r3, r3
	subq	r3, r2, r3
	mulq	r3, r9, r3
	addq	r3, r9, r2
	cmpule	r3, r1, r1
	cmoveq	r1, r2, r3
	srl	r3, r10, r2
	umulh	r3, r0, r1
	stq	r2, 40(r11)		C table[5]
	mulq	r3, r0, r0
	ornot	r31, r1, r1
	subq	r1, r3, r1
	mulq	r1, r9, r1
	addq	r1, r9, r9
	cmpule	r1, r0, r0
	cmoveq	r0, r9, r1
	ldq	r9, 8(r30)
	srl	r1, r10, r1
	ldq	r10, 16(r30)
	stq	r1, 48(r11)		C table[6]
	ldq	r11, 24(r30)
	lda	r30, 32(r30)
	ret	r31, (r26), 1
EPILOGUE()