1dnl Intel P6 mpn_mod_34lsub1 -- remainder modulo 2^24-1. 2 3dnl Copyright 2000, 2001, 2002, 2004 Free Software Foundation, Inc. 4dnl 5dnl This file is part of the GNU MP Library. 6dnl 7dnl The GNU MP Library is free software; you can redistribute it and/or 8dnl modify it under the terms of the GNU Lesser General Public License as 9dnl published by the Free Software Foundation; either version 3 of the 10dnl License, or (at your option) any later version. 11dnl 12dnl The GNU MP Library is distributed in the hope that it will be useful, 13dnl but WITHOUT ANY WARRANTY; without even the implied warranty of 14dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15dnl Lesser General Public License for more details. 16dnl 17dnl You should have received a copy of the GNU Lesser General Public License 18dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 19 20include(`../config.m4') 21 22 23C P6: 2.0 cycles/limb 24 25C TODO 26C Experiments with more unrolling indicate that 1.5 c/l is possible on P6-13 27C with the current carry handling scheme. 28 29C mp_limb_t mpn_mod_34lsub1 (mp_srcptr src, mp_size_t size) 30C 31C Groups of three limbs are handled, with carry bits from 0mod3 into 1mod3 32C into 2mod3, but at that point going into a separate carries total so we 33C don't keep the carry flag live across the loop control. Avoiding decl 34C lets us get to 2.0 c/l, as compared to the generic x86 code at 3.66. 
C
C mpn_mod_34lsub1
C
C In:   PARAM_SRC  (4)  pointer to the limbs, src[0] least significant
C       PARAM_SIZE (8)  number of limbs.  src[0] is loaded before the size
C                       is examined, so presumably size >= 1 is required;
C                       confirm against callers.
C Out:  eax  sum of 24-bit pieces of the operand.  Since 2^24 == 1 modulo
C            2^24-1 this sum is congruent to the operand, but it is not
C            reduced to a canonical remainder.
C Regs: ebx and esi are parked in the parameter slots, edi is pushed, and
C       all three are restored before returning.  ebp is never touched.

defframe(PARAM_SIZE, 8)
defframe(PARAM_SRC,  4)

dnl  Once the parameters are in registers their stack slots are free, so
dnl  they double as save areas for ebx and esi.
define(SAVE_EBX, `PARAM_SIZE')
define(SAVE_ESI, `PARAM_SRC')

        TEXT
        ALIGN(16)
PROLOGUE(mpn_mod_34lsub1)
deflit(`FRAME',0)

        movl    PARAM_SIZE, %ecx
        movl    PARAM_SRC, %edx

        subl    $2, %ecx                C ecx = size-2, flags pick the path
        movl    (%edx), %eax            C eax = src[0], mov keeps the flags
        ja      L(size_ge_3)            C size > 2
        jb      L(size_1)               C size < 2, src[0] is returned as is

        C Exactly two limbs.  src[0] splits at bit 24, src[1] at bit 16.

        movl    4(%edx), %ecx           C src[1]

        movl    %eax, %edx              C copy of src[0]
        shrl    $24, %eax               C top 8 bits of src[0]

        andl    $0xFFFFFF, %edx         C bottom 24 bits of src[0]

        addl    %edx, %eax
        movl    %ecx, %edx              C copy of src[1]
        shrl    $16, %ecx               C top 16 bits of src[1]

        andl    $0xFFFF, %edx           C bottom 16 bits of src[1]
        addl    %ecx, %eax

        shll    $8, %edx                C ... moved up to bits 8 to 23

        addl    %edx, %eax
L(size_1):
        ret


L(size_ge_3):
        C eax   src[0], seeds the 0mod3 accumulator
        C ecx   size-2
        C edx   src
        C ebx, esi, edi still hold the caller values and are saved below
        C before being overwritten

        movl    %ebx, SAVE_EBX
        movl    4(%edx), %ebx           C src[1], seeds the 1mod3 accumulator
        subl    $3, %ecx                C ecx = size-5, flags are for the jng

        movl    %esi, SAVE_ESI
        movl    8(%edx), %esi           C src[2], seeds the 2mod3 accumulator

        pushl   %edi            FRAME_pushl()
        movl    $0, %edi                C carries = 0, mov keeps the subl flags
        jng     L(tail)                 C size < 6, no further full group


L(limb_loop):
        C eax   0mod3 accumulator
        C ebx   1mod3 accumulator
        C ecx   remaining count, in limbs
        C edx   src, three limbs behind the group being added
        C esi   2mod3 accumulator
        C edi   carries out of 2mod3, which count towards 0mod3

        addl    12(%edx), %eax
        adcl    16(%edx), %ebx
        adcl    20(%edx), %esi
        leal    12(%edx), %edx          C step src, lea leaves the carry alone
        adcl    $0, %edi

        subl    $3, %ecx
        jg      L(limb_loop)            C three or more limbs still to add


L(tail):
        C ecx is -2, -1 or 0, standing for 0, 1 or 2 leftover limbs
        cmpl    $-1, %ecx
        jl      L(combine)              C -2, nothing left over

        C one or two leftover limbs
        movl    $0, %ecx                C zero for an absent second limb,
                                        C mov keeps the cmpl flags for the je
        je      L(tail_add)             C -1, a single leftover limb
        movl    16(%edx), %ecx          C second leftover limb
L(tail_add):
        addl    12(%edx), %eax          C into 0mod3
        adcl    %ecx, %ebx              C into 1mod3
        adcl    $0, %esi                C carry into 2mod3
        adcl    $0, %edi                C carry out, counted for 0mod3

L(combine):
        C eax   0mod3 accumulator
        C ebx   1mod3 accumulator
        C esi   2mod3 accumulator
        C edi   carries, same weight as 0mod3
        C ecx, edx are scratch from here on
        C
        C Each accumulator is cut where its weight crosses a multiple of 24
        C bits and the pieces are summed into eax.

        movl    %eax, %ecx              C copy of 0mod3
        shrl    $24, %eax               C total starts as 0mod3 top 8 bits

        andl    $0xFFFFFF, %ecx         C 0mod3 bottom 24 bits
        movl    %edi, %edx              C copy of carries
        shrl    $24, %edi               C carries top 8 bits

        addl    %ecx, %eax              C total += 0mod3 bottom
        andl    $0xFFFFFF, %edx         C carries bottom 24 bits
        movl    %ebx, %ecx              C copy of 1mod3

        shrl    $16, %ebx               C 1mod3 top 16 bits
        addl    %edi, %eax              C total += carries top
        addl    %edx, %eax              C total += carries bottom

        andl    $0xFFFF, %ecx           C 1mod3 bottom 16 bits
        addl    %ebx, %eax              C total += 1mod3 top
        movl    SAVE_EBX, %ebx          C restore ebx while edi is still pushed

        shll    $8, %ecx                C 1mod3 bottom moved to bits 8 to 23
        movl    %esi, %edx              C copy of 2mod3
        popl    %edi            FRAME_popl()

        shrl    $8, %esi                C 2mod3 top 24 bits
        andl    $0xFF, %edx             C 2mod3 bottom 8 bits
        addl    %ecx, %eax              C total += 1mod3 bottom

        shll    $16, %edx               C 2mod3 bottom moved to bits 16 to 23
        addl    %esi, %eax              C total += 2mod3 top
        movl    SAVE_ESI, %esi          C restore esi

        addl    %edx, %eax              C total += 2mod3 bottom

        ret

EPILOGUE()