dnl  AMD64 mpn_copyd optimised for CPUs with fast SSE copying and SSSE3.

dnl  Copyright 2012 Free Software Foundation, Inc.

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C               cycles/limb    cycles/limb    cycles/limb      good
C                aligned        unaligned      best seen     for cpu?
C AMD K8,K9        2.0            illop         1.0/1.0         N
C AMD K10          0.85           illop                         Y/N
C AMD bd1          1.39           1.40                          Y
C AMD bobcat       1.97           8.35          1.5/1.5         N
C Intel P4         2.26           illop                         Y/N
C Intel core2      0.52           0.68-0.80    opt/0.68         Y
C Intel NHM        0.52           0.64         opt/opt          Y
C Intel SBR        0.51           0.54         opt/0.51         Y
C Intel atom       1.16           1.66         opt/opt          Y
C VIA nano         1.09           1.07         opt/opt          Y

C We use only 16-byte operations, except for unaligned top-most and bottom-most
C limbs.  We use the SSSE3 palignr instruction when rp - up = 8 (mod 16).
C
C For operands of < COPYD_SSE_THRESHOLD limbs, we use a plain 64-bit loop,
C taken from the x86_64 default code.
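C
C As an illustration only (this sketch is not part of the build), the palignr
C step used in L(utop)/L(uend) below can be written with the SSSE3 compiler
C intrinsic _mm_alignr_epi8 from <tmmintrin.h>; s is assumed to be a 16-byte
C aligned source pointer:
C
C   #include <tmmintrin.h>
C
C   /* Return the 16 bytes starting 8 bytes above s, built from two aligned
C      16-byte loads: (hi:lo) >> 64 keeps the high quadword of lo as the low
C      quadword of the result and the low quadword of hi as its high one.  */
C   static inline __m128i
C   load_off8 (const __m128i *s)
C   {
C     __m128i lo = _mm_load_si128 (s);
C     __m128i hi = _mm_load_si128 (s + 1);
C     return _mm_alignr_epi8 (hi, lo, 8);
C   }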

C INPUT PARAMETERS
define(`rp', `%rdi')
define(`up', `%rsi')
define(`n', `%rdx')

C There are three instructions for loading an aligned 128-bit quantity.  We use
C movaps, since it has the shortest coding.
define(`movdqa', ``movaps'')

ifdef(`COPYD_SSE_THRESHOLD',`',`define(`COPYD_SSE_THRESHOLD', 7)')

ASM_START()
        TEXT
        ALIGN(64)
PROLOGUE(mpn_copyd)
        FUNC_ENTRY(3)

        lea     -8(up,n,8), up
        lea     -8(rp,n,8), rp

        cmp     $COPYD_SSE_THRESHOLD, n
        jbe     L(bc)

        bt      $3, R32(rp)             C is rp 16-byte aligned?
        jc      L(rp_aligned)           C jump if rp aligned

        mov     (up), %rax              C copy one limb
        mov     %rax, (rp)
        lea     -8(up), up
        lea     -8(rp), rp
        dec     n

L(rp_aligned):
        bt      $3, R32(up)
        jnc     L(uent)

ifelse(eval(COPYD_SSE_THRESHOLD >= 8),1,
`       sub     $8, n',
`       jmp     L(am)')

        ALIGN(16)
L(atop):movdqa  -8(up), %xmm0
        movdqa  -24(up), %xmm1
        movdqa  -40(up), %xmm2
        movdqa  -56(up), %xmm3
        lea     -64(up), up
        movdqa  %xmm0, -8(rp)
        movdqa  %xmm1, -24(rp)
        movdqa  %xmm2, -40(rp)
        movdqa  %xmm3, -56(rp)
        lea     -64(rp), rp
L(am):  sub     $8, n
        jnc     L(atop)

        bt      $2, R32(n)
        jnc     1f
        movdqa  -8(up), %xmm0
        movdqa  -24(up), %xmm1
        lea     -32(up), up
        movdqa  %xmm0, -8(rp)
        movdqa  %xmm1, -24(rp)
        lea     -32(rp), rp

1:      bt      $1, R32(n)
        jnc     1f
        movdqa  -8(up), %xmm0
        lea     -16(up), up
        movdqa  %xmm0, -8(rp)
        lea     -16(rp), rp

1:      bt      $0, n
        jnc     1f
        mov     (up), %r8
        mov     %r8, (rp)

1:      FUNC_EXIT()
        ret

L(uent):sub     $16, n
        movdqa  (up), %xmm0
        jc      L(uend)

        ALIGN(16)
L(utop):sub     $16, n
        movdqa  -16(up), %xmm1
        palignr($8, %xmm1, %xmm0)
        movdqa  %xmm0, -8(rp)
        movdqa  -32(up), %xmm2
        palignr($8, %xmm2, %xmm1)
        movdqa  %xmm1, -24(rp)
        movdqa  -48(up), %xmm3
        palignr($8, %xmm3, %xmm2)
        movdqa  %xmm2, -40(rp)
        movdqa  -64(up), %xmm0
        palignr($8, %xmm0, %xmm3)
        movdqa  %xmm3, -56(rp)
        movdqa  -80(up), %xmm1
        palignr($8, %xmm1, %xmm0)
        movdqa  %xmm0, -72(rp)
        movdqa  -96(up), %xmm2
        palignr($8, %xmm2, %xmm1)
        movdqa  %xmm1, -88(rp)
        movdqa  -112(up), %xmm3
        palignr($8, %xmm3, %xmm2)
        movdqa  %xmm2, -104(rp)
        movdqa  -128(up), %xmm0
        palignr($8, %xmm0, %xmm3)
        movdqa  %xmm3, -120(rp)
        lea     -128(up), up
        lea     -128(rp), rp
        jnc     L(utop)

L(uend):bt      $3, R32(n)
        jnc     1f
        movdqa  -16(up), %xmm1
        palignr($8, %xmm1, %xmm0)
        movdqa  %xmm0, -8(rp)
        movdqa  -32(up), %xmm0
        palignr($8, %xmm0, %xmm1)
        movdqa  %xmm1, -24(rp)
        movdqa  -48(up), %xmm1
        palignr($8, %xmm1, %xmm0)
        movdqa  %xmm0, -40(rp)
        movdqa  -64(up), %xmm0
        palignr($8, %xmm0, %xmm1)
        movdqa  %xmm1, -56(rp)
        lea     -64(up), up
        lea     -64(rp), rp

1:      bt      $2, R32(n)
        jnc     1f
        movdqa  -16(up), %xmm1
        palignr($8, %xmm1, %xmm0)
        movdqa  %xmm0, -8(rp)
        movdqa  -32(up), %xmm0
        palignr($8, %xmm0, %xmm1)
        movdqa  %xmm1, -24(rp)
        lea     -32(up), up
        lea     -32(rp), rp

1:      bt      $1, R32(n)
        jnc     1f
        movdqa  -16(up), %xmm1
        palignr($8, %xmm1, %xmm0)
        movdqa  %xmm0, -8(rp)
        lea     -16(up), up
        lea     -16(rp), rp

1:      bt      $0, n
        jnc     1f
        mov     (up), %r8
        mov     %r8, (rp)

1:      FUNC_EXIT()
        ret

C Basecase code.  Needed for good small operands speed, not for
C correctness as the above code is currently written.

L(bc):  sub     $4, R32(n)
        jc      L(end)

        ALIGN(16)
L(top): mov     (up), %r8
        mov     -8(up), %r9
        lea     -32(rp), rp
        mov     -16(up), %r10
        mov     -24(up), %r11
        lea     -32(up), up
        mov     %r8, 32(rp)
        mov     %r9, 24(rp)
ifelse(eval(COPYD_SSE_THRESHOLD >= 8),1,
`       sub     $4, R32(n)')
        mov     %r10, 16(rp)
        mov     %r11, 8(rp)
ifelse(eval(COPYD_SSE_THRESHOLD >= 8),1,
`       jnc     L(top)')

L(end): bt      $0, R32(n)
        jnc     1f
        mov     (up), %r8
        mov     %r8, (rp)
        lea     -8(rp), rp
        lea     -8(up), up
1:      bt      $1, R32(n)
        jnc     1f
        mov     (up), %r8
        mov     -8(up), %r9
        mov     %r8, (rp)
        mov     %r9, -8(rp)
1:      FUNC_EXIT()
        ret
EPILOGUE()
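
C For reference, the routine above implements the semantics of this plain C
C sketch (illustration only, assuming the usual GMP mpn types; it is not the
C code this file is built from):
C
C   void
C   mpn_copyd (mp_ptr rp, mp_srcptr up, mp_size_t n)
C   {
C     mp_size_t i;
C     /* Copy from the most significant limb downwards, which keeps an
C        overlapping copy with rp >= up correct.  */
C     for (i = n - 1; i >= 0; i--)
C       rp[i] = up[i];
C   }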