dnl  AMD64 mpn_copyi optimised for CPUs with fast SSE copying and SSSE3.

dnl  Copyright 2012 Free Software Foundation, Inc.

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C               cycles/limb  cycles/limb  cycles/limb   good
C                 aligned     unaligned    best seen   for cpu?
C AMD K8,K9        2.0          illop       1.0/1.0       N
C AMD K10          0.85         illop                     Y/N
C AMD bd1          1.39        ? 1.45                     Y/N
C AMD bobcat       1.97        ? 8.17       1.5/1.5       N
C Intel P4         2.26         illop                     Y/N
C Intel core2      0.52         0.82        opt/0.74      Y
C Intel NHM        0.52         0.65        opt/opt       Y
C Intel SBR        0.51         0.55        opt/0.51      Y
C Intel atom       1.16         1.70        opt/opt       Y
C VIA nano         1.09         1.10        opt/opt       Y

C We use only 16-byte operations, except for unaligned top-most and bottom-most
C limbs.  We use the SSSE3 palignr instruction when rp - up = 8 (mod 16).  That
C instruction is better adapted to mpn_copyd's needs; we need to contort the
C code to use it here.
C
C For operands of < COPYI_SSE_THRESHOLD limbs, we use a plain 64-bit loop,
C taken from the x86_64 default code.

C INPUT PARAMETERS
define(`rp', `%rdi')
define(`up', `%rsi')
define(`n',  `%rdx')

C There are three instructions for loading an aligned 128-bit quantity.  We use
C movaps, since it has the shortest coding.
define(`movdqa', ``movaps'')

ifdef(`COPYI_SSE_THRESHOLD',`',`define(`COPYI_SSE_THRESHOLD', 7)')

ASM_START()
	TEXT
	ALIGN(64)
PROLOGUE(mpn_copyi)
	FUNC_ENTRY(3)

	cmp	$COPYI_SSE_THRESHOLD, n
	jbe	L(bc)
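
C The copy below first aligns rp to 16 bytes (copying a single limb if
C needed) and from then on uses only aligned 16-byte loads and stores.  If
C up is then 8 (mod 16), the L(uent) path reads the two aligned 16-byte
C words surrounding each misaligned limb pair and stitches them together
C with palignr: for S = mem[a..a+15] and D = mem[a+16..a+31],
C palignr($8, S, D) leaves mem[a+8..a+23] in D, ready for an aligned store.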

	bt	$3, R32(rp)		C is rp 16-byte aligned?
	jnc	L(rp_aligned)		C jump if rp aligned

	movsq				C copy one limb
	dec	n

L(rp_aligned):
	bt	$3, R32(up)
	jc	L(uent)

ifelse(eval(COPYI_SSE_THRESHOLD >= 8),1,
`	sub	$8, n',
`	jmp	L(am)')

	ALIGN(16)
L(atop):movdqa	0(up), %xmm0
	movdqa	16(up), %xmm1
	movdqa	32(up), %xmm2
	movdqa	48(up), %xmm3
	lea	64(up), up
	movdqa	%xmm0, (rp)
	movdqa	%xmm1, 16(rp)
	movdqa	%xmm2, 32(rp)
	movdqa	%xmm3, 48(rp)
	lea	64(rp), rp
L(am):	sub	$8, n
	jnc	L(atop)

	bt	$2, R32(n)
	jnc	1f
	movdqa	(up), %xmm0
	movdqa	16(up), %xmm1
	lea	32(up), up
	movdqa	%xmm0, (rp)
	movdqa	%xmm1, 16(rp)
	lea	32(rp), rp

1:	bt	$1, R32(n)
	jnc	1f
	movdqa	(up), %xmm0
	lea	16(up), up
	movdqa	%xmm0, (rp)
	lea	16(rp), rp

1:	bt	$0, n
	jnc	1f
	mov	(up), %r8
	mov	%r8, (rp)

1:	FUNC_EXIT()
	ret

L(uent):
C Code handling up - rp = 8 (mod 16)

C FIXME: The code below only handles overlap if it is close to complete, or
C quite separate: up-rp < 5 or up-rp > 15 limbs
	lea	-40(up), %rax		C 40 = 5 * GMP_LIMB_BYTES
	sub	rp, %rax
	cmp	$80, %rax		C 80 = (15-5) * GMP_LIMB_BYTES
	jbe	L(bc)			C deflect to plain loop

	sub	$16, n
	jc	L(uend)

	movdqa	120(up), %xmm3

	sub	$16, n
	jmp	L(um)

	ALIGN(16)
L(utop):movdqa	120(up), %xmm3
	movdqa	%xmm0, -128(rp)
	sub	$16, n
L(um):	movdqa	104(up), %xmm2
	palignr($8, %xmm2, %xmm3)
	movdqa	88(up), %xmm1
	movdqa	%xmm3, 112(rp)
	palignr($8, %xmm1, %xmm2)
	movdqa	72(up), %xmm0
	movdqa	%xmm2, 96(rp)
	palignr($8, %xmm0, %xmm1)
	movdqa	56(up), %xmm3
	movdqa	%xmm1, 80(rp)
	palignr($8, %xmm3, %xmm0)
	movdqa	40(up), %xmm2
	movdqa	%xmm0, 64(rp)
	palignr($8, %xmm2, %xmm3)
	movdqa	24(up), %xmm1
	movdqa	%xmm3, 48(rp)
	palignr($8, %xmm1, %xmm2)
	movdqa	8(up), %xmm0
	movdqa	%xmm2, 32(rp)
	palignr($8, %xmm0, %xmm1)
	movdqa	-8(up), %xmm3
	movdqa	%xmm1, 16(rp)
	palignr($8, %xmm3, %xmm0)
	lea	128(up), up
	lea	128(rp), rp
	jnc	L(utop)

	movdqa	%xmm0, -128(rp)

L(uend):bt	$3, R32(n)
	jnc	1f
	movdqa	56(up), %xmm3
	movdqa	40(up), %xmm2
	palignr($8, %xmm2, %xmm3)
	movdqa	24(up), %xmm1
	movdqa	%xmm3, 48(rp)
	palignr($8, %xmm1, %xmm2)
	movdqa	8(up), %xmm0
	movdqa	%xmm2, 32(rp)
	palignr($8, %xmm0, %xmm1)
	movdqa	-8(up), %xmm3
	movdqa	%xmm1, 16(rp)
	palignr($8, %xmm3, %xmm0)
	lea	64(up), up
	movdqa	%xmm0, (rp)
	lea	64(rp), rp

1:	bt	$2, R32(n)
	jnc	1f
	movdqa	24(up), %xmm1
	movdqa	8(up), %xmm0
	palignr($8, %xmm0, %xmm1)
	movdqa	-8(up), %xmm3
	movdqa	%xmm1, 16(rp)
	palignr($8, %xmm3, %xmm0)
	lea	32(up), up
	movdqa	%xmm0, (rp)
	lea	32(rp), rp

1:	bt	$1, R32(n)
	jnc	1f
	movdqa	8(up), %xmm0
	movdqa	-8(up), %xmm3
	palignr($8, %xmm3, %xmm0)
	lea	16(up), up
	movdqa	%xmm0, (rp)
	lea	16(rp), rp

1:	bt	$0, n
	jnc	1f
	mov	(up), %r8
	mov	%r8, (rp)

1:	FUNC_EXIT()
	ret

C Basecase code.  Needed for good small operands speed, not for
C correctness as the above code is currently written.
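C The loop below copies four limbs per iteration; the bt tests after L(end)
C handle the remaining 0-3 limbs.  Note that the overlap test at L(uent) can
C deflect operands of any size here, so the sub/jnc must stay in the loop
C even when COPYI_SSE_THRESHOLD < 8; the "1 ||" in the ifelse:s below makes
C sure they are always emitted.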

L(bc):	lea	-8(rp), rp
	sub	$4, R32(n)
	jc	L(end)

	ALIGN(16)
L(top):	mov	(up), %r8
	mov	8(up), %r9
	lea	32(rp), rp
	mov	16(up), %r10
	mov	24(up), %r11
	lea	32(up), up
	mov	%r8, -24(rp)
	mov	%r9, -16(rp)
ifelse(eval(1 || COPYI_SSE_THRESHOLD >= 8),1,
`	sub	$4, R32(n)')
	mov	%r10, -8(rp)
	mov	%r11, (rp)
ifelse(eval(1 || COPYI_SSE_THRESHOLD >= 8),1,
`	jnc	L(top)')

L(end):	bt	$0, R32(n)
	jnc	1f
	mov	(up), %r8
	mov	%r8, 8(rp)
	lea	8(rp), rp
	lea	8(up), up
1:	bt	$1, R32(n)
	jnc	1f
	mov	(up), %r8
	mov	8(up), %r9
	mov	%r8, 8(rp)
	mov	%r9, 16(rp)
1:	FUNC_EXIT()
	ret
EPILOGUE()