dnl  x86 mpn_mul_basecase -- Multiply two limb vectors and store the result
dnl  in a third limb vector.

dnl  Copyright 1996, 1997, 1998, 1999, 2000, 2001, 2002 Free Software
dnl  Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or
dnl  modify it under the terms of the GNU Lesser General Public License as
dnl  published by the Free Software Foundation; either version 3 of the
dnl  License, or (at your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful,
dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
dnl  Lesser General Public License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C     cycles/crossproduct
C P5:     15
C P6:      7.5
C K6:     12.5
C K7:      5.5
C P4:     24


C void mpn_mul_basecase (mp_ptr wp,
C                        mp_srcptr xp, mp_size_t xsize,
C                        mp_srcptr yp, mp_size_t ysize);
C
C This was written in haste since the Pentium optimized code that was used
C for all x86 machines was slow for the Pentium II.  This code would benefit
C from some cleanup.
C
C To shave off some percentage of the run-time, one should make 4 variants
C of the L(outer) loop, for the four different outcomes of xsize mod 4.  That
C would avoid L(oop0) altogether.  Code expansion would be > 4-fold for that
C part of the function, but since it is not very large, that would be
C acceptable.
C
C The mul loop (at L(oopM)) might need some tweaking.  Its current speed is
C unknown.
C mpn_mul_basecase (wp, xp, xsize, yp, ysize)
C
C Schoolbook multiplication of the xsize-limb vector at xp by the
C ysize-limb vector at yp, writing xsize+ysize limbs at wp.
C
C Structure:
C   1. L(oopM):  wp[0..xsize] = xp[] * yp[0]  (plain multiply, stores only).
C   2. L(outer): for every further limb yp[i], xp[] * yp[i] is added into
C      the result one limb higher than the previous pass.  L(oop0) handles
C      the first xsize mod 4 limbs one at a time, L(oopX) handles the rest
C      four limbs per iteration, and the final carry limb is stored above.
C
C The xsize = 1 shortcut returns without looking at ysize, and xp[0] and
C yp[0] are read unconditionally, so the routine relies on the caller
C passing xsize >= ysize >= 1.
C
C Frame: PARAM_* name the incoming stack arguments and VAR_* name two
C local slots inside the VAR_STACK_SPACE bytes reserved by the prologue.
C The defframe/deflit/FRAME machinery itself lives in the included m4
C support files, not in this file.

defframe(PARAM_YSIZE,20)
defframe(PARAM_YP,   16)
defframe(PARAM_XSIZE,12)
defframe(PARAM_XP,   8)
defframe(PARAM_WP,   4)

defframe(VAR_MULTIPLIER, -4)
defframe(VAR_COUNTER,    -8)
deflit(VAR_STACK_SPACE,  8)

	TEXT
	ALIGN(8)

PROLOGUE(mpn_mul_basecase)
deflit(`FRAME',0)

	C Reserve the two local slots, then save the registers used below.
	C %ebx is only saved later, once we know the long path is taken.
	subl	$VAR_STACK_SPACE,%esp
	pushl	%esi
	pushl	%ebp
	pushl	%edi
deflit(`FRAME',eval(VAR_STACK_SPACE+12))

	movl	PARAM_XP,%esi
	movl	PARAM_WP,%edi
	movl	PARAM_YP,%ebp

	movl	(%esi),%eax		C load xp[0]
	mull	(%ebp)			C multiply by yp[0]
	movl	%eax,(%edi)		C store to wp[0]
	movl	PARAM_XSIZE,%ecx	C xsize
	decl	%ecx			C If xsize = 1, ysize = 1 too
	jz	L(done)

	pushl	%ebx
FRAME_pushl()
	movl	%edx,%ebx		C high limb of xp[0]*yp[0] is the carry limb

	leal	4(%esi),%esi		C advance to xp[1]
	leal	4(%edi),%edi		C advance to wp[1]

C First pass: wp[j] = low(xp[j]*yp[0]) + carry, for j = 1 .. xsize-1.
C %ebx carries the high limb from one iteration to the next; %ecx counts
C the remaining x limbs.
L(oopM):
	movl	(%esi),%eax		C load next limb at xp[j]
	leal	4(%esi),%esi
	mull	(%ebp)
	addl	%ebx,%eax		C low product + incoming carry limb
	movl	%edx,%ebx
	adcl	$0,%ebx			C new carry limb = high product + CF
	movl	%eax,(%edi)
	leal	4(%edi),%edi
	decl	%ecx
	jnz	L(oopM)

	movl	%ebx,(%edi)		C most significant limb of product
	addl	$4,%edi			C increment wp
	C Rewind both pointers by xsize limbs: %esi is back at xp[0] and,
	C because of the extra increment above, %edi is at wp[1].
	movl	PARAM_XSIZE,%eax
	shll	$2,%eax
	subl	%eax,%edi
	subl	%eax,%esi

	movl	PARAM_YSIZE,%eax	C ysize
	decl	%eax
	jz	L(skip)			C ysize = 1: the first pass was everything
	movl	%eax,VAR_COUNTER	C outer loop counter = ysize-1 passes left

C One pass per remaining y limb: add xp[] * yp[i] into the result.
L(outer):
	C The yp argument slot doubles as the running y pointer, since
	C %ebp is needed as a scratch register inside L(oopX).
	movl	PARAM_YP,%ebp		C yp
	addl	$4,%ebp			C make ebp point to next y limb
	movl	%ebp,PARAM_YP
	movl	(%ebp),%eax		C copy y limb ...
	movl	%eax,VAR_MULTIPLIER	C ... to stack slot
	movl	PARAM_XSIZE,%ecx

	xorl	%ebx,%ebx		C carry limb starts at zero
	andl	$3,%ecx			C xsize mod 4 limbs go through L(oop0)
	jz	L(end0)

C Single-limb addmul step, run xsize mod 4 times.
L(oop0):
	movl	(%esi),%eax
	mull	VAR_MULTIPLIER
	leal	4(%esi),%esi
	addl	%ebx,%eax		C low product + incoming carry limb
	movl	$0,%ebx			C movl rather than xorl: CF must survive
	adcl	%ebx,%edx		C high product + CF
	addl	%eax,(%edi)
	adcl	%edx,%ebx		C propagate carry into cylimb

	leal	4(%edi),%edi
	decl	%ecx
	jnz	L(oop0)

L(end0):
	movl	PARAM_XSIZE,%ecx
	shrl	$2,%ecx			C number of 4-limb groups
	jz	L(endX)

C Four-limb addmul step.  %ebx and %ebp alternate as the accumulator for
C the next result limb; each is cleared with movl so that the carry flag
C from the preceding add is still available to the following adcl.
	ALIGN(8)
L(oopX):
	movl	(%esi),%eax
	mull	VAR_MULTIPLIER
	addl	%eax,%ebx		C lo0 + cylimb
	movl	$0,%ebp
	adcl	%edx,%ebp		C hi0 + CF

	movl	4(%esi),%eax
	mull	VAR_MULTIPLIER
	addl	%ebx,(%edi)
	adcl	%eax,%ebp		C new lo + cylimb
	movl	$0,%ebx
	adcl	%edx,%ebx

	movl	8(%esi),%eax
	mull	VAR_MULTIPLIER
	addl	%ebp,4(%edi)
	adcl	%eax,%ebx		C new lo + cylimb
	movl	$0,%ebp
	adcl	%edx,%ebp

	movl	12(%esi),%eax
	mull	VAR_MULTIPLIER
	addl	%ebx,8(%edi)
	adcl	%eax,%ebp		C new lo + cylimb
	movl	$0,%ebx
	adcl	%edx,%ebx

	addl	%ebp,12(%edi)
	adcl	$0,%ebx			C propagate carry into cylimb

	leal	16(%esi),%esi
	leal	16(%edi),%edi
	decl	%ecx
	jnz	L(oopX)

L(endX):
	movl	%ebx,(%edi)		C carry limb becomes the new top limb
	addl	$4,%edi

	C we incremented wp and xp in the loop above; compensate
	C (the extra 4 added to %edi leaves it one limb higher than at the
	C start of this pass, ready for the next y limb)
	movl	PARAM_XSIZE,%eax
	shll	$2,%eax
	subl	%eax,%edi
	subl	%eax,%esi

	movl	VAR_COUNTER,%eax
	decl	%eax
	movl	%eax,VAR_COUNTER
	jnz	L(outer)

C Normal exit: four registers were pushed (%ebx last).
L(skip):
	popl	%ebx
	popl	%edi
	popl	%ebp
	popl	%esi
	addl	$VAR_STACK_SPACE,%esp	C release the locals reserved in the prologue
	ret

C xsize = 1 exit: %ebx was never pushed on this path, and %edi still
C points at wp[0].
L(done):
	movl	%edx,4(%edi)		C store to wp[1]
	popl	%edi
	popl	%ebp
	popl	%esi
	addl	$VAR_STACK_SPACE,%esp	C release the locals reserved in the prologue
	ret

EPILOGUE()