dnl  AMD K7 mpn_copyd -- copy limb vector, decrementing.

dnl  Copyright 1999, 2000, 2002 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C     alignment dst/src, A=0mod8 N=4mod8
C        A/A   A/N   N/A   N/N
C K7    0.75   1.0   1.0   0.75


C void mpn_copyd (mp_ptr dst, mp_srcptr src, mp_size_t size);
C
C The various comments in mpn/x86/k7/copyi.asm apply here too.
dnl  Stack slots of the three arguments.  defframe is defined outside this
dnl  file; the numbers are presumably byte offsets from the stack pointer at
dnl  entry (return address at 0) -- confirm against the macro definition.
defframe(PARAM_SIZE,12)
defframe(PARAM_SRC, 8)
defframe(PARAM_DST, 4)
deflit(`FRAME',0)

dnl  parameter space reused
dnl
dnl  Each argument is loaded into a register before its slot is overwritten:
dnl  the size slot then holds the saved ebx and the src slot the saved esi.
define(SAVE_EBX,`PARAM_SIZE')
define(SAVE_ESI,`PARAM_SRC')

dnl  minimum 5 since the unrolled code can't handle less than 5
dnl  (it may move one limb for alignment and then always makes at least one
dnl  full four-limb pass through the main loop)
deflit(UNROLL_THRESHOLD, 5)

	TEXT
	ALIGN(32)
PROLOGUE(mpn_copyd)

	C Copy size limbs (4 bytes each) from src to dst, working from the
	C highest-addressed limb down to the lowest.
	C
	C Sizes below UNROLL_THRESHOLD use a one-limb-per-pass integer loop.
	C Larger sizes use an MMX loop moving four limbs per pass, followed by
	C a tail of 0 to 3 limbs.
	C
	C ebx and esi are restored before returning.  eax, ecx and edx are
	C overwritten, as are mm0 and mm1 on the unrolled path.

	movl	PARAM_SIZE, %ecx
	movl	%ebx, SAVE_EBX		C size now lives in ecx, its slot is free

	movl	PARAM_SRC, %eax
	movl	PARAM_DST, %edx

	cmpl	$UNROLL_THRESHOLD, %ecx
	jae	L(unroll)

	orl	%ecx, %ecx		C sets ZF when size is zero
	jz	L(simple_done)

L(simple):
	C eax	src
	C ebx	scratch
	C ecx	counter
	C edx	dst
	C
	C this loop is 2 cycles/limb
	C
	C ecx counts down from size to 1, the -4 displacement turns that into
	C limb index ecx-1, so the top limb is moved first.

	movl	-4(%eax,%ecx,4), %ebx
	movl	%ebx, -4(%edx,%ecx,4)
	decl	%ecx
	jnz	L(simple)

L(simple_done):
	movl	SAVE_EBX, %ebx
	ret


L(unroll):
	C ebx = (src + 4*size) & (dst + 4*size).  Bit 2 of the result is set
	C only when both end addresses have bit 2 set, the 4mod8 case.  esi is
	C borrowed as scratch for the second address and restored straight
	C after the andl.

	movl	%esi, SAVE_ESI
	leal	(%eax,%ecx,4), %ebx
	leal	(%edx,%ecx,4), %esi

	andl	%esi, %ebx
	movl	SAVE_ESI, %esi
	subl	$4, %ecx		C size-4

	testl	$4, %ebx   C testl to pad code closer to 16 bytes for L(top)
	jz	L(aligned)

	C both src and dst unaligned, process one limb to align them
	C (with ecx = size-4, displacement 12 addresses limb size-1, the top one)
	movl	12(%eax,%ecx,4), %ebx
	movl	%ebx, 12(%edx,%ecx,4)
	decl	%ecx
L(aligned):


	ALIGN(16)
L(top):
	C eax	src
	C ebx
	C ecx	counter, limbs
	C edx	dst
	C
	C Each pass moves limbs ecx to ecx+3 as two 8-byte transfers.  The
	C stores are issued after the subl, hence the extra 16 in their
	C displacements.  movq leaves the flags alone, so jns still tests the
	C sign of the decremented counter.

	movq	8(%eax,%ecx,4), %mm0
	movq	(%eax,%ecx,4), %mm1
	subl	$4, %ecx
	movq	%mm0, 16+8(%edx,%ecx,4)
	movq	%mm1, 16(%edx,%ecx,4)
	jns	L(top)


	C now %ecx is -4 to -1 representing respectively 0 to 3 limbs remaining
	C
	C Those limbs are the lowest-addressed ones.  In the low byte of ecx,
	C bit 1 is set when 2 or 3 remain and bit 0 is set when 1 or 3 remain.

	testb	$2, %cl
	jz	L(finish_not_two)

	C the pair starts at limb 0 when ecx is -2 and at limb 1 when ecx is -1
	movq	8(%eax,%ecx,4), %mm0
	movq	%mm0, 8(%edx,%ecx,4)
L(finish_not_two):

	testb	$1, %cl
	jz	L(done)

	C odd count, limb 0 is still outstanding
	movl	(%eax), %ebx
	movl	%ebx, (%edx)

L(done):
	movl	SAVE_EBX, %ebx
	emms				C MMX registers were used on this path
	ret


EPILOGUE()