dnl  AMD K7 mpn_mod_34lsub1 -- remainder modulo 2^24-1.

dnl  Copyright 2000, 2001, 2002, 2004, 2005, 2008 Free Software Foundation,
dnl  Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or
dnl  modify it under the terms of the GNU Lesser General Public License as
dnl  published by the Free Software Foundation; either version 3 of the
dnl  License, or (at your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful,
dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
dnl  Lesser General Public License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C         cycles/limb
C Athlon:     1
C Hammer:     1


C mp_limb_t mpn_mod_34lsub1 (mp_srcptr src, mp_size_t size)
C
C The loop form below and the 64 byte code alignment seem necessary for the
C claimed speed.  This is a bit strange, since normally k7 isn't very
C sensitive to such things.  Perhaps there has to be 6 instructions in the
C first 16 bytes for the BTB entry or something.
dnl  Stack arguments.  Per the C prototype mpn_mod_34lsub1 (src, size), src
dnl  is the first argument and size the second.  defframe comes from
dnl  config.m4; the offsets are presumably relative to the stack pointer at
dnl  entry, adjusted by the running FRAME value -- confirm against its
dnl  definition.
defframe(PARAM_SIZE, 8)
defframe(PARAM_SRC, 4)

dnl  re-use parameter space: size is copied into ecx at function entry,
dnl  before this slot is overwritten with the saved edi.
define(SAVE_EDI, `PARAM_SIZE')

C Method: 2^24 is congruent to 1 modulo 2^24-1, hence 2^32 is congruent to
C 2^8 and 2^64 to 2^16.  The 32-bit limb at index i therefore carries weight
C 1, 2^8 or 2^16 according to i mod 3.  Limbs are summed into three
C accumulators selected by index mod 3, using one continuous add-with-carry
C chain, and the accumulators are then split at the bit position where their
C weight wraps past 2^24 and the pieces are added together.
C
C The value returned in eax is congruent to the input modulo 2^24-1 but is
C not reduced to a canonical range; for instance the one limb case returns
C src[0] unchanged.
C
C A size below 2 takes the one limb path, so src[0] is always read.
C Callers are presumably required to pass size >= 1 -- TODO confirm.

	TEXT
	ALIGN(64)
PROLOGUE(mpn_mod_34lsub1)
deflit(`FRAME',0)

	movl	PARAM_SIZE, %ecx
	movl	PARAM_SRC, %edx

	subl	$2, %ecx		C flags from here select the path
	ja	L(three_or_more)	C size > 2

	movl	(%edx), %eax		C movl leaves the subl flags intact
	jb	L(one)			C size < 2

	C size == 2

	movl	4(%edx), %ecx
	movl	%eax, %edx
	shrl	$24, %eax		C src[0] high

	andl	$0xFFFFFF, %edx		C src[0] low
	addl	%edx, %eax
	movl	%ecx, %edx

	andl	$0xFFFF, %ecx
	shrl	$16, %edx		C src[1] high
	addl	%edx, %eax

	shll	$8, %ecx		C src[1] low
	addl	%ecx, %eax

L(one):
	ret


L(three_or_more):
	C eax
	C ebx
	C ecx	size-2
	C edx	src
	C esi
	C edi

	pushl	%ebx	FRAME_pushl()
	xorl	%eax, %eax
	xorl	%ebx, %ebx

	movl	%edi, SAVE_EDI
	pushl	%esi	FRAME_pushl()
	xorl	%esi, %esi		C and clear carry flag


	C The loop handles six limbs per pass, as two groups of three.  The
	C carry flag is live across the whole loop: the carry out of the
	C 2mod3 accumulator has weight 2^96, congruent to 1, and is absorbed
	C by the next adcl into the 0mod3 accumulator.  Pointer and counter
	C updates therefore use leal and decl, neither of which writes the
	C carry flag.

	C code offset 0x40 at this point
L(top):
	C eax	acc 0mod3
	C ebx	acc 1mod3
	C ecx	counter, limbs
	C edx	src
	C esi	acc 2mod3
	C edi

	leal	24(%edx), %edx
	leal	-2(%ecx), %ecx
	adcl	-24(%edx), %eax
	adcl	-20(%edx), %ebx
	adcl	-16(%edx), %esi

	decl	%ecx			C counter now reduced by 3 in total
	jng	L(done_loop)

	leal	-2(%ecx), %ecx
	adcl	-12(%edx), %eax
	adcl	-8(%edx), %ebx
	adcl	-4(%edx), %esi

	decl	%ecx
	jg	L(top)


	C All six limbs of this pass were consumed, step edx on so that
	C -12(%edx) again addresses the first unprocessed limb, as it does
	C when the loop is left from its middle.
	leal	12(%edx), %edx


L(done_loop):
	C ecx is -2, -1 or 0 representing 0, 1 or 2 more limbs, respectively
	C
	C edi is set to a mask selecting minus the weight of the carry out
	C of whichever accumulator received the last adcl: 1 after 2mod3,
	C 2^8 after 0mod3, 2^16 after 1mod3.  incl, decl and movl all leave
	C the carry flag untouched.

	incl	%ecx
	movl	$0xFFFFFFFF, %edi
	js	L(combine)		C no more limbs

	adcl	-12(%edx), %eax
	decl	%ecx
	movl	$0xFFFFFF00, %edi
	js	L(combine)		C that was the last limb

	adcl	-8(%edx), %ebx
	movl	$0xFFFF0000, %edi


L(combine):
	C eax	acc 0mod3
	C ebx	acc 1mod3
	C ecx
	C edx
	C esi	acc 2mod3
	C edi	mask

	sbbl	%ecx, %ecx		C carry, 0 or -1
	movl	%eax, %edx		C 0mod3
	shrl	$24, %eax		C 0mod3 high

	andl	%edi, %ecx		C carry masked, 0 or minus its weight
	andl	$0x00FFFFFF, %edx	C 0mod3 low
	movl	%ebx, %edi		C 1mod3

	subl	%ecx, %eax		C apply carry
	shrl	$16, %ebx		C 1mod3 high
	andl	$0xFFFF, %edi

	addl	%edx, %eax		C apply 0mod3 low
	movl	%esi, %edx		C 2mod3
	shll	$8, %edi		C 1mod3 low

	addl	%ebx, %eax		C apply 1mod3 high
	shrl	$8, %esi		C 2mod3 high
	movzbl	%dl, %edx		C 2mod3 low

	addl	%edi, %eax		C apply 1mod3 low
	shll	$16, %edx		C 2mod3 low

	addl	%esi, %eax		C apply 2mod3 high
	popl	%esi	FRAME_popl()

	movl	SAVE_EDI, %edi
	addl	%edx, %eax		C apply 2mod3 low
	popl	%ebx	FRAME_popl()

	ret

EPILOGUE()