dnl  AMD64 mpn_copyi optimised for CPUs with fast SSE.

dnl  Copyright 2003, 2005, 2007, 2011, 2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C                    cycles/limb        good for cpu?
C AMD K8,K9
C AMD K10            0.85  1.64         Y/N
C AMD bd1            1.4   1.4          Y
C AMD bobcat
C Intel P4           2.3   2.3          Y
C Intel core2        1.0   1.0
C Intel NHM          0.5   0.67         Y
C Intel SBR          0.5   0.75         Y
C Intel atom
C VIA nano           1.16  5.16         Y/N

C We try to do as many 16-byte operations as possible.  The top-most and
C bottom-most writes might need 8-byte operations.  We can always write using
C aligned 16-byte operations, we read with both aligned and unaligned 16-byte
C operations.

C Instead of having separate loops for reading aligned and unaligned, we read
C using MOVDQU.  This seems to work great except for core2; there performance
C doubles when reading using MOVDQA (for aligned source).  It is unclear how to
C best handle the unaligned case there.

C INPUT PARAMETERS
define(`rp', `%rdi')
define(`up', `%rsi')
define(`n',  `%rdx')

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

dnl define(`movdqu', lddqu)

ASM_START()
	TEXT
	ALIGN(64)
PROLOGUE(mpn_copyi)
	FUNC_ENTRY(3)

	cmp	$3, n
	jc	L(bc)

	test	$8, R8(rp)		C is rp 16-byte aligned?
	jz	L(ali)			C jump if rp aligned
	movsq				C copy single limb
	dec	n

	sub	$16, n
	jc	L(sma)

	ALIGN(16)
L(top):	movdqu	(up), %xmm0
	movdqu	16(up), %xmm1
	movdqu	32(up), %xmm2
	movdqu	48(up), %xmm3
	movdqu	64(up), %xmm4
	movdqu	80(up), %xmm5
	movdqu	96(up), %xmm6
	movdqu	112(up), %xmm7
	lea	128(up), up
	movdqa	%xmm0, (rp)
	movdqa	%xmm1, 16(rp)
	movdqa	%xmm2, 32(rp)
	movdqa	%xmm3, 48(rp)
	movdqa	%xmm4, 64(rp)
	movdqa	%xmm5, 80(rp)
	movdqa	%xmm6, 96(rp)
	movdqa	%xmm7, 112(rp)
	lea	128(rp), rp
L(ali):	sub	$16, n
	jnc	L(top)

L(sma):	test	$8, R8(n)
	jz	1f
	movdqu	(up), %xmm0
	movdqu	16(up), %xmm1
	movdqu	32(up), %xmm2
	movdqu	48(up), %xmm3
	lea	64(up), up
	movdqa	%xmm0, (rp)
	movdqa	%xmm1, 16(rp)
	movdqa	%xmm2, 32(rp)
	movdqa	%xmm3, 48(rp)
	lea	64(rp), rp
1:
	test	$4, R8(n)
	jz	1f
	movdqu	(up), %xmm0
	movdqu	16(up), %xmm1
	lea	32(up), up
	movdqa	%xmm0, (rp)
	movdqa	%xmm1, 16(rp)
	lea	32(rp), rp
1:
	test	$2, R8(n)
	jz	1f
	movdqu	(up), %xmm0
	lea	16(up), up
	movdqa	%xmm0, (rp)
	lea	16(rp), rp
	ALIGN(16)
1:
L(end):	bt	$0, n
	jnc	1f
	mov	(up), %r8
	mov	%r8, (rp)
1:
	FUNC_EXIT()
	ret

C Basecase code.  Needed for good small operands speed, not for
C correctness as the above code is currently written.
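C
C In C terms the basecase below does roughly the following (an illustrative
C sketch only, not part of the original sources):
C
C	while (n >= 2)
C	  {
C	    rp[0] = up[0];  rp[1] = up[1];
C	    rp += 2;  up += 2;  n -= 2;
C	  }
C	if (n & 1)
C	  rp[0] = up[0];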

L(bc):	sub	$2, n
	jc	L(end)
	ALIGN(16)
1:	mov	(up), %rax
	mov	8(up), %rcx
	lea	16(up), up
	mov	%rax, (rp)
	mov	%rcx, 8(rp)
	lea	16(rp), rp
	sub	$2, n
	jnc	1b

	bt	$0, n
	jnc	L(ret)
	mov	(up), %rax
	mov	%rax, (rp)
L(ret):	FUNC_EXIT()
	ret
EPILOGUE()
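
C At the C level, mpn_copyi simply copies n limbs from up to rp in increasing
C address order.  An illustrative reference sketch (not part of the generated
C code, shown only to document the intended semantics):
C
C	void
C	mpn_copyi (mp_ptr rp, mp_srcptr up, mp_size_t n)
C	{
C	  mp_size_t i;
C	  for (i = 0; i < n; i++)
C	    rp[i] = up[i];
C	}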