1dnl Intel P6 mpn_copyd -- copy limb vector backwards. 2 3dnl Copyright 2001, 2002 Free Software Foundation, Inc. 4dnl 5dnl This file is part of the GNU MP Library. 6dnl 7dnl The GNU MP Library is free software; you can redistribute it and/or 8dnl modify it under the terms of the GNU Lesser General Public License as 9dnl published by the Free Software Foundation; either version 3 of the 10dnl License, or (at your option) any later version. 11dnl 12dnl The GNU MP Library is distributed in the hope that it will be useful, 13dnl but WITHOUT ANY WARRANTY; without even the implied warranty of 14dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15dnl Lesser General Public License for more details. 16dnl 17dnl You should have received a copy of the GNU Lesser General Public License 18dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 19 20include(`../config.m4') 21 22 23C P6: 1.75 cycles/limb, or 0.75 if no overlap 24 25 26C void mpn_copyd (mp_ptr dst, mp_srcptr src, mp_size_t size); 27C 28C An explicit loop is used because a decrementing rep movsl is a bit slow at 29C 2.4 c/l. That rep movsl also has about a 40 cycle startup time, and the 30C code here stands a chance of being faster if the branches predict well. 31C 32C The slightly strange loop form seems necessary for the claimed speed. 33C Maybe load/store ordering affects it. 34C 35C The source and destination are checked to see if they're actually 36C overlapping, since it might be possible to use an incrementing rep movsl 37C at 0.75 c/l. (It doesn't suffer the bad startup time of the decrementing 38C version.) 39C 40C Enhancements: 41C 42C Top speed for an all-integer copy is probably 1.0 c/l, being one load and 43C one store each cycle. Unrolling the loop below would approach 1.0, but 44C it'd be good to know why something like store/load/subl + store/load/jnz 45C doesn't already run at 1.0 c/l. It looks like it should decode in 2 46C cycles, but doesn't run that way. 
defframe(PARAM_SIZE,12)
defframe(PARAM_SRC, 8)
defframe(PARAM_DST, 4)

dnl  Re-using parameter space to save esi and edi.  Each slot is only
dnl  overwritten after the parameter it held has been loaded into a register
dnl  (size into ecx before SAVE_ESI is written, src into esi before SAVE_EDI
dnl  is written).
define(SAVE_ESI,`PARAM_SIZE')
define(SAVE_EDI,`PARAM_SRC')

	TEXT
	ALIGN(16)

C void mpn_copyd (mp_ptr dst, mp_srcptr src, mp_size_t size);
C
C Copy size limbs from src to dst, working from the high limb down to the
C low limb.  When the operands turn out not to overlap in the direction
C that matters, an incrementing rep movsl is used instead.
C
C esi and edi are saved on entry and restored on every exit path.  eax, ecx
C and edx are used as scratch.

PROLOGUE(mpn_copyd)
deflit(`FRAME',0)

	movl	PARAM_SIZE, %ecx

	movl	%esi, SAVE_ESI
	movl	PARAM_SRC, %esi

	movl	%edi, SAVE_EDI
	movl	PARAM_DST, %edi

	subl	$1, %ecx		C ecx = size-1
	jb	L(zero)			C borrow means size was 0

	C movl leaves the flags alone, so the jz still tests the subl above
	movl	(%esi,%ecx,4), %eax	C src[size-1]
	jz	L(one)			C size was 1

	movl	-4(%esi,%ecx,4), %edx	C src[size-2]
	subl	$2, %ecx		C ecx = size-3
	jbe	L(done_loop)		C 2 or 3 limbs only


	C The usual overlap is
	C
	C     high                   low
	C     +------------------+
	C     |               dst|
	C     +------------------+
	C           +------------------+
	C           |               src|
	C           +------------------+
	C
	C We can use an incrementing copy in the following circumstances.
	C
	C     src+4*size<=dst, since then the regions are disjoint
	C
	C     src==dst, clearly (though this shouldn't occur normally)
	C
	C     src>dst, since in that case it's a requirement of the
	C              parameters that src>=dst+size*4, and hence the
	C              regions are disjoint
	C

	C edx = src + 4*size.  ecx is size-3 at this point, hence the 12 byte
	C displacement.  The address has to be formed from src (esi): an
	C address formed from dst plus a positive offset is always above dst,
	C which would leave the jbe below never taken and push every disjoint
	C src-below-dst copy through the slower loop.
	leal	12(%esi,%ecx,4), %edx
	cmpl	%edi, %esi
	jae	L(use_movsl)		C src >= dst

	C edx was used as a temporary for the comparison, so reload the limb
	C it held.  The reload is a movl and so does not disturb the flags
	C that the jbe tests.
	cmpl	%edi, %edx
	movl	4(%esi,%ecx,4), %edx	C src[size-2] again
	jbe	L(use_movsl)		C src+4*size <= dst


L(top):
	C eax	prev high limb
	C ebx
	C ecx	counter, size-3 down to 0 or -1, inclusive, by 2s
	C edx	prev low limb
	C esi	src
	C edi	dst
	C ebp

	C Each pass stores the two limbs loaded previously and loads the
	C next two, so every load is issued before the store that could
	C overwrite its source in the overlapping case.
	movl	%eax, 8(%edi,%ecx,4)
	movl	(%esi,%ecx,4), %eax

	movl	%edx, 4(%edi,%ecx,4)
	movl	-4(%esi,%ecx,4), %edx

	subl	$2, %ecx
	jnbe	L(top)			C loop while the result is above zero


L(done_loop):
	C ecx is 0 or -1 here.  Store the two limbs still held in registers.
	movl	%eax, 8(%edi,%ecx,4)
	movl	%edx, 4(%edi,%ecx,4)

	C copy low limb (needed if size was odd; if size was even the edx
	C store just above already wrote dst[0] and this repeats it)
	movl	(%esi), %eax
L(one):
	movl	%eax, (%edi)
	movl	SAVE_EDI, %edi
	movl	SAVE_ESI, %esi

	ret


L(use_movsl):
	C eax
	C ebx
	C ecx	size-3
	C edx
	C esi	src
	C edi	dst
	C ebp

	addl	$3, %ecx		C back to the full limb count

	cld		C better safe than sorry, see mpn/x86/README

	rep
	movsl

	C fall through to the common register restore

L(zero):
	movl	SAVE_ESI, %esi
	movl	SAVE_EDI, %edi

	ret

EPILOGUE()