dnl  Pentium-4 mpn_copyd -- copy limb vector, decrementing.

dnl  Copyright 1999-2001 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.


dnl  The std/rep/movsl/cld is very slow for small blocks on pentium4.  Its
dnl  startup time seems to be about 165 cycles.  It then needs 2.6 c/l.
dnl  We therefore use an open-coded 2 c/l copying loop.

dnl  Ultimately, we may want to use 64-bit movq or 128-bit movdqu in some
dnl  nifty unrolled arrangement.  Clearly, that could reach much higher
dnl  speeds, at least for large blocks.
include(`../config.m4')


dnl  Names for the three arguments.  The numbers are presumably byte offsets
dnl  from the stack pointer at entry (FRAME is 0 below) -- confirm against
dnl  the defframe definition in the x86 m4 support files.
defframe(PARAM_SIZE, 12)
defframe(PARAM_SRC,  8)
defframe(PARAM_DST,  4)

	TEXT
	ALIGN(8)

dnl  mpn_copyd -- copy PARAM_SIZE limbs of 4 bytes each from PARAM_SRC to
dnl  PARAM_DST, starting with the highest index and finishing at index 0.
dnl
dnl  Register roles:
dnl    eax   source base address
dnl    edx   destination base address
dnl    ecx   index of the limb being moved, counting down from size-1
dnl    ebx   the limb in transit
dnl
dnl  The incoming ebx value is stored into the PARAM_SIZE location once the
dnl  size has been read into ecx, and is reloaded from there just before ret,
dnl  so ebx is unchanged on return.
dnl
dnl  When size-1 is negative (size of zero, for instance) the loop is skipped
dnl  and nothing is copied.

PROLOGUE(mpn_copyd)
deflit(`FRAME',0)

	movl	PARAM_SRC, %eax
	movl	PARAM_DST, %edx
	movl	PARAM_SIZE, %ecx
	movl	%ebx, PARAM_SIZE

	addl	$-1, %ecx
	js	L(done)

L(top):
	movl	(%eax,%ecx,4), %ebx
	movl	%ebx, (%edx,%ecx,4)
	addl	$-1, %ecx
	jns	L(top)

L(done):
	movl	PARAM_SIZE, %ebx
	ret

EPILOGUE()