dnl  AMD K7 mpn_copyi -- copy limb vector, incrementing.

dnl  Copyright 1999, 2000, 2002, 2003 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or
dnl  modify it under the terms of the GNU Lesser General Public License as
dnl  published by the Free Software Foundation; either version 3 of the
dnl  License, or (at your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful,
dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
dnl  Lesser General Public License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C    alignment dst/src, A=0mod8 N=4mod8
C       A/A   A/N   N/A   N/N
C K7   0.75   1.0   1.0   0.75


C void mpn_copyi (mp_ptr dst, mp_srcptr src, mp_size_t size);
C
C Copy src,size to dst,size.
C
C This code at 0.75 or 1.0 c/l is always faster than a plain rep movsl at
C 1.33 c/l.
C
C The K7 can do a 64-bit load and 64-bit store in one cycle (optimization
C guide 22007 appendix B), so 0.5 c/l should be possible, however nothing
C under 0.7 c/l is known.  Apparently only two 32-bit stores can be done in
C one cycle, so perhaps some scheduling is needed to ensure it's a
C load+store in each cycle, not store+store.
C
C If both source and destination are unaligned then one limb is processed at
C the start to make them aligned and so get 0.75 c/l, whereas if they'd been
C used unaligned it would be 1.5 c/l.

C Argument slots for mpn_copyi (dst, src, size).  The offsets 4, 8, 12 follow
C the order of the prototype; defframe itself is defined outside this file
C (NOTE(review): presumably yields an %esp-relative operand adjusted by
C FRAME -- confirm against the m4 macro definitions).
defframe(PARAM_SIZE,12)
defframe(PARAM_SRC, 8)
defframe(PARAM_DST, 4)

dnl  parameter space reused
dnl  (the size slot doubles as the save area for %ebx, so PARAM_SIZE has to
dnl  be read before SAVE_EBX is written)
define(SAVE_EBX,`PARAM_SIZE')

dnl  minimum 5 since the unrolled code can't handle less than 5
dnl  (it may copy 1 alignment limb and then always copies 4 limbs in the
dnl  first pass of L(top))
deflit(UNROLL_THRESHOLD, 5)

	TEXT
	ALIGN(32)
PROLOGUE(mpn_copyi)
deflit(`FRAME',0)

	C Load the arguments: ecx = size, eax = src, edx = dst.  %ebx is
	C used as scratch below, so its incoming value is parked in the
	C size slot and reloaded on both exit paths.

	movl	PARAM_SIZE, %ecx
	movl	%ebx, SAVE_EBX		C overwrites the size slot, size is already in ecx

	movl	PARAM_SRC, %eax
	movl	PARAM_DST, %edx

	cmpl	$UNROLL_THRESHOLD, %ecx
	jae	L(unroll)		C unsigned: size >= 5 uses the MMX code

	orl	%ecx, %ecx		C ZF set when size is 0
	jz	L(simple_done)		C nothing to copy

L(simple):
	C eax	src, incrementing
	C ebx	scratch
	C ecx	counter
	C edx	dst, incrementing
	C
	C this loop is 2 cycles/limb
	C
	C Used for sizes 1 to 4.  One limb is copied per pass.  The jnz
	C tests the ZF produced by decl; the two leal in between advance the
	C pointers without modifying flags.

	movl	(%eax), %ebx
	movl	%ebx, (%edx)
	decl	%ecx
	leal	4(%eax), %eax
	leal	4(%edx), %edx
	jnz	L(simple)

L(simple_done):
	C No MMX register has been touched on this path, so no emms here.
	movl	SAVE_EBX, %ebx
	ret


L(unroll):
	C size >= 5 here.
	C
	C Both pointers are rebased to "end - 12" (12 bytes = 3 limbs before
	C the end of the vector) and ecx becomes the negative limb index
	C -(size-3), so that (%eax,%ecx,4) and (%edx,%ecx,4) start out
	C addressing limb 0 and ecx counts upward toward zero.
	C
	C ebx collects src AND dst so that bit 2 (value 4) is set only when
	C it is set in both pointers.

	movl	%eax, %ebx		C copy of src for the alignment test
	leal	-12(%eax,%ecx,4), %eax	C src end - 12
	subl	$3, %ecx		C size-3

	andl	%edx, %ebx		C src & dst
	leal	(%edx,%ecx,4), %edx	C dst end - 12
	negl	%ecx			C -(size-3)

	testl	$4, %ebx   C testl to pad code closer to 16 bytes for L(top)
	jz	L(aligned)		C at least one pointer has bit 2 clear

	C both src and dst unaligned, process one limb to align them
	C (NOTE(review): this takes bit 2 set to mean 4 mod 8, i.e. limb
	C pointers are at least 4-byte aligned -- confirm with callers)
	movl	(%eax,%ecx,4), %ebx
	movl	%ebx, (%edx,%ecx,4)
	incl	%ecx
L(aligned):


	ALIGN(16)
L(top):
	C eax	src end - 12
	C ebx
	C ecx	counter, negative, limbs
	C edx	dst end - 12
	C
	C Four limbs per pass: two 64-bit loads, then two 64-bit stores.
	C ecx is advanced between the loads and the stores, hence the -16
	C bias in the store displacements.  The ja tests the flags left by
	C the addl; the loop is left once the addl carries, i.e. once ecx
	C has stepped from negative to 0..3.

	movq	(%eax,%ecx,4), %mm0
	movq	8(%eax,%ecx,4), %mm1
	addl	$4, %ecx
	movq	%mm0, -16(%edx,%ecx,4)
	movq	%mm1, -16+8(%edx,%ecx,4)
	ja	L(top)		C jump no carry and not zero


	C now %ecx is 0 to 3 representing respectively 3 to 0 limbs remaining
	C
	C Since ecx counts up, its bits are the inverse of the remaining
	C count: bit 1 clear means a pair of limbs is still to be copied,
	C bit 0 clear means a single final limb is still to be copied.

	testb	$2, %cl
	jnz	L(finish_not_two)	C ecx is 2 or 3: no pair left

	C ecx is 0 or 1: copy two limbs at the current index
	movq	(%eax,%ecx,4), %mm0
	movq	%mm0, (%edx,%ecx,4)
L(finish_not_two):

	testb	$1, %cl
	jnz	L(done)			C ecx is 1 or 3: no single limb left

	C ecx is 0 or 2: copy the last limb, which lives at end - 4,
	C that is 8 bytes past the rebased pointers
	movl	8(%eax), %ebx
	movl	%ebx, 8(%edx)

L(done):
	movl	SAVE_EBX, %ebx		C reload the caller's ebx from the size slot
	emms				C mm0/mm1 were used, leave MMX state before returning
	ret

EPILOGUE()