1dnl Pentium-4 mpn_copyi -- copy limb vector, incrementing. 2dnl 3 4dnl Copyright 1999, 2000, 2001 Free Software Foundation, Inc. 5dnl 6dnl This file is part of the GNU MP Library. 7dnl 8dnl The GNU MP Library is free software; you can redistribute it and/or 9dnl modify it under the terms of the GNU Lesser General Public License as 10dnl published by the Free Software Foundation; either version 3 of the 11dnl License, or (at your option) any later version. 12dnl 13dnl The GNU MP Library is distributed in the hope that it will be useful, 14dnl but WITHOUT ANY WARRANTY; without even the implied warranty of 15dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 16dnl Lesser General Public License for more details. 17dnl 18dnl You should have received a copy of the GNU Lesser General Public License 19dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 20 21 22dnl The rep/movsl is very slow for small blocks on pentium4. Its startup 23dnl time seems to be about 110 cycles. It then copies at a rate of one 24dnl limb per cycle. We therefore fall back to an open-coded 2 c/l copying 25dnl loop for smaller sizes. 26 27dnl Ultimately, we may want to use 64-bit movd or 128-bit movdqu in some 28dnl nifty unrolled arrangement. Clearly, that could reach much higher 29dnl speeds, at least for large blocks. 
include(`../config.m4')


dnl  Stack arguments in push order: dst at 4, src at 8, size at 12.  The
dnl  size is a count of 4-byte limbs.
defframe(PARAM_SIZE, 12)
defframe(PARAM_SRC, 8)
defframe(PARAM_DST, 4)

dnl  Sizes strictly above this many limbs are handed to rep movsl, anything
dnl  at or below it goes through the open-coded loop.
deflit(`REP_MOVS_THRESHOLD', 150)

	TEXT
	ALIGN(8)

PROLOGUE(mpn_copyi)
deflit(`FRAME',0)

	movl	PARAM_SIZE, %ecx
	cmpl	$REP_MOVS_THRESHOLD, %ecx
	jg	L(string_copy)

	C Short copy.  ebx is used as the transfer register and is restored
	C before returning.  Its old value is parked in the size argument
	C slot, which is free now that the count lives in ecx.
	movl	%ebx, PARAM_SIZE
	movl	PARAM_DST, %edx
	movl	PARAM_SRC, %eax
	testl	%ecx, %ecx
	jz	L(done)

L(top):
	C eax = source pointer
	C edx = destination pointer
	C ecx = limbs still to copy, nonzero on entry to each pass
	C ebx = limb in flight
	movl	(%eax), %ebx
	leal	4(%eax), %eax
	addl	$-1, %ecx
	movl	%ebx, (%edx)
	leal	4(%edx), %edx	C leal keeps the zero flag set by addl

	jnz	L(top)

L(done):
	movl	PARAM_SIZE, %ebx
	ret

L(string_copy):
	cld	C make movsl count upwards, see mpn/x86/README

	C esi and edi are held in eax and edx across the string move and
	C put back afterwards.  ecx already holds the limb count.
	movl	%esi, %eax
	movl	%edi, %edx
	movl	PARAM_SRC, %esi
	movl	PARAM_DST, %edi

	rep
	movsl

	movl	%edx, %edi
	movl	%eax, %esi

	ret

EPILOGUE()