dnl  AMD K6-2 mpn_copyd -- copy limb vector, decrementing.

dnl  Copyright 2001, 2002 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or
dnl  modify it under the terms of the GNU Lesser General Public License as
dnl  published by the Free Software Foundation; either version 3 of the
dnl  License, or (at your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful,
dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
dnl  Lesser General Public License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C K6-2: 1.0 cycles/limb


C void mpn_copyd (mp_ptr dst, mp_srcptr src, mp_size_t size);
C
C The loop here is no faster than a rep movsl at 1.0 c/l, but it avoids a 30
C cycle startup time, which amounts, for instance, to a 2x speedup at 15
C limbs.
C
C If dst is 4mod8 the loop would be 1.17 c/l, but that's avoided by
C processing one limb separately to make it aligned.  This and a final odd
C limb are handled in a branch-free fashion, ending up re-copying if the
C special case isn't needed.
C
C Alternatives:
C
C There used to be a big unrolled version of this, running at 0.56 c/l if
C the destination was aligned, but that seemed rather excessive for the
C relative importance of copyd.
C
C If the destination alignment is ignored and just left to run at 1.17 c/l
C some code size and a fixed few cycles can be saved.  Considering how few
C uses copyd finds, perhaps that should be favoured.  The current code has
C the attraction of being no slower than a basic rep movsl though.
C Stack offsets of the three arguments, matching the prototype
C mpn_copyd (dst, src, size).  defframe itself is defined outside this
C file; presumably the offsets are relative to the stack pointer at entry.
defframe(PARAM_SIZE,12)
defframe(PARAM_SRC, 8)
defframe(PARAM_DST, 4)

dnl  re-using parameter space
dnl  (the size slot holds the saved ebx once size has been loaded into ecx)
define(SAVE_EBX,`PARAM_SIZE')

	TEXT
	ALIGN(16)

PROLOGUE(mpn_copyd)
deflit(`FRAME',0)

	C Load size before saving ebx: SAVE_EBX is the same stack slot as
	C PARAM_SIZE, so the save overwrites the size argument.
	movl	PARAM_SIZE, %ecx
	movl	%ebx, SAVE_EBX

	movl	PARAM_SRC, %eax
	movl	PARAM_DST, %edx

	C ecx = size-1.  Both branches below test the flags of this single
	C subtraction: borrow means size was 0, zero means size was 1.
	subl	$1, %ecx		C better code alignment than decl
	jb	L(zero)

	jz	L(one_more)
	C ebx = dst + 4*size, the address just past the top limb.
	leal	4(%edx,%ecx,4), %ebx

	C Copy the top limb, index size-1, on its own.
Zdisp(	movd,	0,(%eax,%ecx,4), %mm0)	C high limb
Zdisp(	movd,	%mm0, 0,(%edx,%ecx,4))	C Zdisp for good code alignment

	C size == 2: the high limb is done and only limb 0 is left.
	cmpl	$1, %ecx
	je	L(one_more)

	C ebx = bit 2 of dst+4*size.  That address is exactly 8 bytes above
	C the address of dst[size-2], so both are the same modulo 8.
	shrl	$2, %ebx
	andl	$1, %ebx		C 1 if dst[size-2] unaligned

	C When ebx is 1 the pair loop starts one limb lower, so its 8-byte
	C stores begin at dst[size-3] instead of dst[size-2].  The top limb
	C has already been copied above, so nothing is skipped.
	subl	%ebx, %ecx
	nop				C code alignment

L(top):
	C eax	src
	C ebx
	C ecx	counter
	C edx	dst
	C
	C Each pass moves the limb pair at indices ecx-1 and ecx as one
	C 8-byte quantity.  The store displacement is 4 rather than -4
	C because ecx has been reduced by 2 limbs (8 bytes) in between, so
	C load and store address the same limb indices.  The ja tests the
	C flags of the subl (movq does not modify EFLAGS) and loops while
	C the new ecx is above zero.

	movq	-4(%eax,%ecx,4), %mm0
	subl	$2, %ecx

	movq	%mm0, 4(%edx,%ecx,4)
	ja	L(top)


L(one_more):
	C Copy limb 0 unconditionally.  If the last loop pass covered
	C limbs 0 and 1 this simply copies limb 0 a second time.
	movd	(%eax), %mm0
	movd	%mm0, (%edx)

	movl	SAVE_EBX, %ebx
	C emms_or_femms is a macro defined outside this file; presumably it
	C emits emms or femms to leave MMX state before returning.
	emms_or_femms
L(zero):
	C Also reached directly when size == 0.  On that path ebx was
	C saved but never modified and no MMX instruction has executed.
	ret

EPILOGUE()