1dnl AMD K7 mpn_copyi -- copy limb vector, incrementing. 2 3dnl Copyright 1999, 2000, 2002, 2003 Free Software Foundation, Inc. 4 5dnl This file is part of the GNU MP Library. 6dnl 7dnl The GNU MP Library is free software; you can redistribute it and/or modify 8dnl it under the terms of either: 9dnl 10dnl * the GNU Lesser General Public License as published by the Free 11dnl Software Foundation; either version 3 of the License, or (at your 12dnl option) any later version. 13dnl 14dnl or 15dnl 16dnl * the GNU General Public License as published by the Free Software 17dnl Foundation; either version 2 of the License, or (at your option) any 18dnl later version. 19dnl 20dnl or both in parallel, as here. 21dnl 22dnl The GNU MP Library is distributed in the hope that it will be useful, but 23dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 24dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 25dnl for more details. 26dnl 27dnl You should have received copies of the GNU General Public License and the 28dnl GNU Lesser General Public License along with the GNU MP Library. If not, 29dnl see https://www.gnu.org/licenses/. 30 31include(`../config.m4') 32 33 34C alignment dst/src, A=0mod8 N=4mod8 35C A/A A/N N/A N/N 36C K7 0.75 1.0 1.0 0.75 37 38 39C void mpn_copyi (mp_ptr dst, mp_srcptr src, mp_size_t size); 40C 41C Copy src,size to dst,size. 42C 43C This code at 0.75 or 1.0 c/l is always faster than a plain rep movsl at 44C 1.33 c/l. 45C 46C The K7 can do a 64-bit load and 64-bit store in one cycle (optimization 47C guide 22007 appendix B), so 0.5 c/l should be possible, however nothing 48C under 0.7 c/l is known. Apparently only two 32-bit stores can be done in 49C one cycle, so perhaps some scheduling is needed to ensure it's a 50C load+store in each cycle, not store+store. 
C
C When neither src nor dst is 8-byte aligned, a single limb is copied ahead
C of the main loop so that the loop runs on aligned pointers at 0.75 c/l;
C left unaligned the same loop would run at 1.5 c/l.

defframe(PARAM_DST, 4)
defframe(PARAM_SRC, 8)
defframe(PARAM_SIZE,12)

dnl  %ebx is parked in the stack slot of the size argument, which is free
dnl  once size has been loaded into a register
define(SAVE_EBX,`PARAM_SIZE')

dnl  5 is the lowest usable value, the unrolled code cannot cope with fewer
dnl  than 5 limbs
deflit(UNROLL_THRESHOLD, 5)

	TEXT
	ALIGN(32)
PROLOGUE(mpn_copyi)
deflit(`FRAME',0)

	movl	PARAM_SRC, %eax
	movl	PARAM_DST, %edx

	C size has to be fetched before %ebx is stored over its slot
	movl	PARAM_SIZE, %ecx
	movl	%ebx, SAVE_EBX

	C unsigned compare, sizes of UNROLL_THRESHOLD or more take the MMX path
	cmpl	$UNROLL_THRESHOLD, %ecx
	jae	L(big)

	C nothing at all to do for size==0
	testl	%ecx, %ecx
	jz	L(small_end)

L(small):
	C eax	src pointer, stepping up one limb per pass
	C ebx	limb in transit
	C ecx	limbs still to copy
	C edx	dst pointer, stepping up one limb per pass
	C
	C 2 cycles/limb for this loop
	C
	C the pointers are stepped with leal since it leaves the flags alone,
	C so jnz acts on the result of decl

	movl	(%eax), %ebx
	movl	%ebx, (%edx)
	decl	%ecx
	leal	4(%eax), %eax
	leal	4(%edx), %edx
	jnz	L(small)

L(small_end):
	C no MMX registers were touched on this path, so no emms
	movl	SAVE_EBX, %ebx
	ret


L(big):
	C ebx = src & dst, formed from the incoming pointers before either
	C of them is rebased
	movl	%eax, %ebx
	andl	%edx, %ebx

	leal	-12(%eax,%ecx,4), %eax	C src end - 12
	subl	$3, %ecx		C size-3
	leal	(%edx,%ecx,4), %edx	C dst end - 12
	negl	%ecx			C -(size-3)

	C bit 2 of src & dst is set only when it is set in both pointers
	C
	C testl is used here to pad the code closer to 16 bytes for L(quad)
	testl	$4, %ebx
	jz	L(no_fixup)

	C src and dst are both unaligned, copy one limb to align them
	movl	(%eax,%ecx,4), %ebx
	movl	%ebx, (%edx,%ecx,4)
	incl	%ecx
L(no_fixup):


	ALIGN(16)
L(quad):
	C eax	src end - 12
	C ebx	not used in this loop
	C ecx	negative limb index, counting up towards zero
	C edx	dst end - 12
	C
	C four limbs per pass, moved as two 64-bit quantities

	movq	(%eax,%ecx,4), %mm0
	movq	8(%eax,%ecx,4), %mm1
	addl	$4, %ecx
	movq	%mm0, -16(%edx,%ecx,4)	C -16 undoes the addl just above
	movq	%mm1, -16+8(%edx,%ecx,4)
	ja	L(quad)		C loop while addl gave no carry and not zero


	C %ecx is now 0, 1, 2 or 3, meaning 3, 2, 1 or 0 limbs are left

	testb	$2, %cl
	jnz	L(no_pair)

	C %ecx is 0 or 1, so at least two limbs are left, copy a pair
	movq	(%eax,%ecx,4), %mm0
	movq	%mm0, (%edx,%ecx,4)
L(no_pair):

	testb	$1, %cl
	jnz	L(fin)

	C %ecx is 0 or 2, the very last limb (at end-4) is still to be copied
	movl	8(%eax), %ebx
	movl	%ebx, 8(%edx)

L(fin):
	movl	SAVE_EBX, %ebx
	emms
	ret

EPILOGUE()