dnl  AMD64 mpn_copyd optimised for CPUs with fast SSE.

dnl  Copyright 2003, 2005, 2007, 2011, 2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C                    cycles/limb     good for cpu?
C AMD K8,K9
C AMD K10                0.85             Y
C AMD bd1                0.8              Y
C AMD bobcat
C Intel P4               2.28             Y
C Intel core2            1
C Intel NHM              0.5              Y
C Intel SBR              0.5              Y
C Intel atom
C VIA nano               1.1              Y

C We try to do as many 16-byte operations as possible.  The top-most and
C bottom-most writes might need 8-byte operations.  We can always write using
C aligned 16-byte operations; we read with both aligned and unaligned 16-byte
C operations.

C Instead of having separate loops for reading aligned and unaligned data, we
C read using MOVDQU.  This seems to work well except on core2, where performance
C doubles when reading with MOVDQA (for an aligned source).  It is unclear how
C best to handle the unaligned case there.

C INPUT PARAMETERS
define(`rp', `%rdi')
define(`up', `%rsi')
define(`n',  `%rdx')

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
        TEXT
        ALIGN(16)
PROLOGUE(mpn_copyd)
        FUNC_ENTRY(3)

        test    n, n
        jz      L(don)

        lea     -16(rp,n,8), rp
        lea     -16(up,n,8), up

        test    $8, R8(rp)              C is rp 16-byte aligned?
        jz      L(ali)                  C jump if rp aligned
        mov     8(up), %rax
        lea     -8(up), up
        mov     %rax, 8(rp)
        lea     -8(rp), rp
        dec     n

        sub     $16, n
        jc      L(sma)

        ALIGN(16)
L(top): movdqu  (up), %xmm0
        movdqu  -16(up), %xmm1
        movdqu  -32(up), %xmm2
        movdqu  -48(up), %xmm3
        movdqu  -64(up), %xmm4
        movdqu  -80(up), %xmm5
        movdqu  -96(up), %xmm6
        movdqu  -112(up), %xmm7
        lea     -128(up), up
        movdqa  %xmm0, (rp)
        movdqa  %xmm1, -16(rp)
        movdqa  %xmm2, -32(rp)
        movdqa  %xmm3, -48(rp)
        movdqa  %xmm4, -64(rp)
        movdqa  %xmm5, -80(rp)
        movdqa  %xmm6, -96(rp)
        movdqa  %xmm7, -112(rp)
        lea     -128(rp), rp
L(ali): sub     $16, n
        jnc     L(top)

L(sma): test    $8, R8(n)
        jz      1f
        movdqu  (up), %xmm0
        movdqu  -16(up), %xmm1
        movdqu  -32(up), %xmm2
        movdqu  -48(up), %xmm3
        lea     -64(up), up
        movdqa  %xmm0, (rp)
        movdqa  %xmm1, -16(rp)
        movdqa  %xmm2, -32(rp)
        movdqa  %xmm3, -48(rp)
        lea     -64(rp), rp
1:
        test    $4, R8(n)
        jz      1f
        movdqu  (up), %xmm0
        movdqu  -16(up), %xmm1
        lea     -32(up), up
        movdqa  %xmm0, (rp)
        movdqa  %xmm1, -16(rp)
        lea     -32(rp), rp
1:
        test    $2, R8(n)
        jz      1f
        movdqu  (up), %xmm0
        lea     -16(up), up
        movdqa  %xmm0, (rp)
        lea     -16(rp), rp
1:
        test    $1, R8(n)
        jz      1f
        mov     8(up), %r8
        mov     %r8, 8(rp)
1:
L(don): FUNC_EXIT()
        ret
EPILOGUE()
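
C The block below is a minimal C-level sketch of the strategy used above.  It
C is a comment-only illustration, not part of this file or of the GMP build;
C the function name sketch_copyd and the parameter names dst, src and nlimbs
C are invented here, while mp_ptr, mp_srcptr and mp_size_t are the usual GMP
C types from gmp.h and uintptr_t comes from stdint.h.
C
C   void
C   sketch_copyd (mp_ptr dst, mp_srcptr src, mp_size_t nlimbs)
C   {
C     mp_size_t i = nlimbs;
C     /* Copy from the highest limb downwards, which is what makes mpn_copyd
C        safe for overlapping operands where the destination is at or above
C        the source.  */
C     /* Peel one limb (an 8-byte store) if the top-most 16-byte store would
C        be misaligned; this mirrors the alignment test on the destination
C        pointer above.  */
C     while (i != 0 && ((uintptr_t) (dst + i) & 15) != 0)
C       i -= 1, dst[i] = src[i];
C     /* Main part: 2-limb (16-byte) chunks with aligned stores.  The loop
C        above unrolls this to eight 16-byte chunks (16 limbs) per iteration,
C        using MOVDQU loads and MOVDQA stores, and the tail code below the
C        main loop handles the remaining 8-, 4- and 2-limb pieces the same
C        way.  */
C     while (i >= 2)
C       i -= 2, dst[i] = src[i], dst[i + 1] = src[i + 1];
C     /* Bottom-most limb, if any, with an 8-byte store.  */
C     if (i != 0)
C       dst[0] = src[0];
C   }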