1dnl AMD K7 mpn_mod_34lsub1 -- remainder modulo 2^24-1. 2 3dnl Copyright 2000-2002, 2004, 2005, 2008 Free Software Foundation, Inc. 4 5dnl This file is part of the GNU MP Library. 6dnl 7dnl The GNU MP Library is free software; you can redistribute it and/or modify 8dnl it under the terms of either: 9dnl 10dnl * the GNU Lesser General Public License as published by the Free 11dnl Software Foundation; either version 3 of the License, or (at your 12dnl option) any later version. 13dnl 14dnl or 15dnl 16dnl * the GNU General Public License as published by the Free Software 17dnl Foundation; either version 2 of the License, or (at your option) any 18dnl later version. 19dnl 20dnl or both in parallel, as here. 21dnl 22dnl The GNU MP Library is distributed in the hope that it will be useful, but 23dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 24dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 25dnl for more details. 26dnl 27dnl You should have received copies of the GNU General Public License and the 28dnl GNU Lesser General Public License along with the GNU MP Library. If not, 29dnl see https://www.gnu.org/licenses/. 30 31include(`../config.m4') 32 33 34C cycles/limb 35C Athlon: 1 36C Hammer: 1 37 38 39C mp_limb_t mpn_mod_34lsub1 (mp_srcptr src, mp_size_t size) 40C 41C The loop form below and the 64 byte code alignment seem necessary for the 42C claimed speed. This is a bit strange, since normally k7 isn't very 43C sensitive to such things. Perhaps there has to be 6 instructions in the 44C first 16 bytes for the BTB entry or something. 
C Incoming stack arguments, named via defframe with the offsets shown:
C src at 4, size at 8.
defframe(PARAM_SIZE, 8)
defframe(PARAM_SRC, 4)

dnl  re-use parameter space: edi is saved in the size argument slot.  The
dnl  slot is only written in the three_or_more path, after size has already
dnl  been loaded into ecx.
define(SAVE_EDI, `PARAM_SIZE')

        TEXT
        ALIGN(64)
PROLOGUE(mpn_mod_34lsub1)
deflit(`FRAME',0)

        C Returns in eax a value congruent to {src,size} modulo 2^24-1.
        C The value is a sum of pieces and is not fully reduced (for
        C size 1 it is simply src[0]).
        C
        C Limbs are 32 bits.  Since 2^24 == 1 mod 2^24-1, limb i carries
        C weight 2^(32*i), which is 1, 2^8 or 2^16 according to i mod 3.
        C
        C size is taken to be at least 1: size 0 is not treated specially
        C and follows the size 1 path, reading src[0].

        movl    PARAM_SIZE, %ecx
        movl    PARAM_SRC, %edx

        subl    $2, %ecx                C flags: above if size > 2, below if size < 2
        ja      L(three_or_more)

        movl    (%edx), %eax            C movl leaves the subl flags intact
        jb      L(one)                  C size 1, return src[0] as is

        C size 2: fold both limbs directly, no accumulators needed

        movl    4(%edx), %ecx
        movl    %eax, %edx
        shrl    $24, %eax               C src[0] high 8 bits

        andl    $0xFFFFFF, %edx         C src[0] low 24 bits
        addl    %edx, %eax
        movl    %ecx, %edx

        andl    $0xFFFF, %ecx
        shrl    $16, %edx               C src[1] high 16 bits, weight 2^48 == 1
        addl    %edx, %eax

        shll    $8, %ecx                C src[1] low 16 bits, weight 2^32 == 2^8
        addl    %ecx, %eax

L(one):
        ret


L(three_or_more):
        C eax
        C ebx
        C ecx   size-2
        C edx   src
        C esi
        C edi

        C ebx, esi and edi are preserved for the caller: ebx and esi by
        C push/pop, edi through SAVE_EDI.

        pushl   %ebx    FRAME_pushl()
        xorl    %eax, %eax
        xorl    %ebx, %ebx

        movl    %edi, SAVE_EDI
        pushl   %esi    FRAME_pushl()
        xorl    %esi, %esi              C and clear carry flag


        C code offset 0x40 at this point
L(top):
        C eax   acc 0mod3
        C ebx   acc 1mod3
        C ecx   counter, limbs
        C edx   src
        C esi   acc 2mod3
        C edi

        C Each pass adds up to six limbs, three per half.  All the adcl
        C instructions form one carry chain: the carry out of one
        C accumulator feeds the next, and the carry out of esi wraps
        C round into eax (2^96 == 1).  The pointer and counter are
        C updated with leal and decl, which do not change the carry flag,
        C so the chain also survives across iterations.

        leal    24(%edx), %edx
        leal    -2(%ecx), %ecx
        adcl    -24(%edx), %eax
        adcl    -20(%edx), %ebx
        adcl    -16(%edx), %esi

        decl    %ecx                    C three limbs consumed in total
        jng     L(done_loop)            C two or fewer limbs left

        leal    -2(%ecx), %ecx
        adcl    -12(%edx), %eax
        adcl    -8(%edx), %ebx
        adcl    -4(%edx), %esi

        decl    %ecx
        jg      L(top)


        C Leaving from the second half: advance edx by three more limbs
        C so the -12 and -8 offsets below address the next unread limbs,
        C the same as when leaving from the first half.
        leal    12(%edx), %edx


L(done_loop):
        C ecx is -2, -1 or 0 representing 0, 1 or 2 more limbs, respectively

        C edi is set to minus the weight of the pending carry, which
        C depends on which accumulator was added to last:
        C   esi (no more limbs)  weight 1     mask 0xFFFFFFFF
        C   eax (1 more limb)    weight 2^8   mask 0xFFFFFF00
        C   ebx (2 more limbs)   weight 2^16  mask 0xFFFF0000
        C incl, decl and movl leave the carry flag alone, and each js
        C tests the sign set by the incl or decl before it.

        incl    %ecx
        movl    $0xFFFFFFFF, %edi
        js      L(combine)

        adcl    -12(%edx), %eax
        decl    %ecx
        movl    $0xFFFFFF00, %edi
        js      L(combine)

        adcl    -8(%edx), %ebx
        movl    $0xFFFF0000, %edi


L(combine):
        C eax   acc 0mod3
        C ebx   acc 1mod3
        C ecx
        C edx
        C esi   acc 2mod3
        C edi   mask

        C Fold the three accumulators and the pending carry into eax.

        sbbl    %ecx, %ecx              C carry, -1 if set else 0
        movl    %eax, %edx              C 0mod3
        shrl    $24, %eax               C 0mod3 high

        andl    %edi, %ecx              C carry masked, minus the carry weight or 0
        andl    $0x00FFFFFF, %edx       C 0mod3 low
        movl    %ebx, %edi              C 1mod3

        subl    %ecx, %eax              C apply carry, subtracting the negative adds it
        shrl    $16, %ebx               C 1mod3 high
        andl    $0xFFFF, %edi

        addl    %edx, %eax              C apply 0mod3 low
        movl    %esi, %edx              C 2mod3
        shll    $8, %edi                C 1mod3 low

        addl    %ebx, %eax              C apply 1mod3 high
        shrl    $8, %esi                C 2mod3 high
        movzbl  %dl, %edx               C 2mod3 low

        addl    %edi, %eax              C apply 1mod3 low
        shll    $16, %edx               C 2mod3 low

        addl    %esi, %eax              C apply 2mod3 high
        popl    %esi    FRAME_popl()

        movl    SAVE_EDI, %edi
        addl    %edx, %eax              C apply 2mod3 low
        popl    %ebx    FRAME_popl()

        ret

EPILOGUE()