dnl  AMD64 mpn_copyd optimised for CPUs with fast SSE.

dnl  Copyright 2003, 2005, 2007, 2011, 2012, 2015 Free Software Foundation,
dnl  Inc.

dnl  Contributed to the GNU project by Torbjörn Granlund.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb     cycles/limb     cycles/limb     good
C	      aligned	      unaligned       best seen      for cpu?
C AMD K8,K9
C AMD K10	 0.85		 1.64				Y/N
C AMD bull	 1.4		 1.4				Y
C AMD pile	 0.68		 0.98				Y/N
C AMD steam
C AMD excavator
C AMD bobcat
C AMD jaguar	 0.65		 1.02		opt/0.93	Y/N
C Intel P4	 2.3		 2.3				Y
C Intel core	 1.0		 1.0		0.52/0.80	N
C Intel NHM	 0.5		 0.67				Y
C Intel SBR	 0.51		 0.75		opt/0.54	Y/N
C Intel IBR	 0.50		 0.57		opt/0.50	Y
C Intel HWL	 0.50		 0.57		opt/0.51	Y
C Intel BWL	 0.55		 0.62		opt/0.55	Y
C Intel atom
C Intel SLM	 1.02		 1.27		opt/1.04	Y/N
C VIA nano	 1.16		 5.16				Y/N

C We try to do as many 16-byte operations as possible.  The top-most and
C bottom-most writes might need 8-byte operations.  We can always write using
C aligned 16-byte operations, we read with both aligned and unaligned 16-byte
C operations.

C Instead of having separate loops for reading aligned and unaligned, we read
C using MOVDQU.  This seems to work great except for core2; there performance
C doubles when reading using MOVDQA (for aligned source).  It is unclear how to
C best handle the unaligned case there.

C mpn_copyd(rp, up, n): copy n limbs from up[] to rp[], walking from the
C highest address downwards ("copyd" = decreasing).  NOTE(review): as with
C all mpn copy-down routines, this direction presumably permits overlap with
C rp >= up — confirm against the mpn interface documentation.

C INPUT PARAMETERS
define(`rp', `%rdi')			C destination limb pointer
define(`up', `%rsi')			C source limb pointer
define(`n', `%rdx')			C limb count

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

dnl define(`movdqu', lddqu)

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_copyd)
	FUNC_ENTRY(3)

	test	n, n			C n == 0?
	jz	L(don)			C nothing to copy

C Point rp/up 16 bytes below the one-past-the-end of each operand, so the
C top 16-byte chunk sits at offset 0 and the loop can step downwards.
	lea	-16(rp,n,8), rp
	lea	-16(up,n,8), up

	test	$8, R8(rp)		C is rp 16-byte aligned?
	jz	L(ali)			C jump if rp aligned
	mov	8(up), %rax		C copy the single top limb with 8-byte
	lea	-8(up), up		C moves so the remaining writes can use
	mov	%rax, 8(rp)		C aligned movdqa
	lea	-8(rp), rp
	dec	n

L(ali):	sub	$16, n			C 16 limbs (128 bytes) per iteration
	jc	L(sma)			C fewer than 16 limbs left: tail only

C Win64 ABI: xmm6-xmm15 are callee-saved, so preserve the two we clobber.
IFDOS(`	add	$-56, %rsp	')
IFDOS(`	movdqa	%xmm6, (%rsp)	')
IFDOS(`	movdqa	%xmm7, 16(%rsp)	')

	ALIGN(16)
C Main loop: 8 unaligned 16-byte reads, then 8 aligned 16-byte writes.
L(top):	movdqu	(up), %xmm0
	movdqu	-16(up), %xmm1
	movdqu	-32(up), %xmm2
	movdqu	-48(up), %xmm3
	movdqu	-64(up), %xmm4
	movdqu	-80(up), %xmm5
	movdqu	-96(up), %xmm6
	movdqu	-112(up), %xmm7
	lea	-128(up), up
	movdqa	%xmm0, (rp)
	movdqa	%xmm1, -16(rp)
	movdqa	%xmm2, -32(rp)
	movdqa	%xmm3, -48(rp)
	movdqa	%xmm4, -64(rp)
	movdqa	%xmm5, -80(rp)
	movdqa	%xmm6, -96(rp)
	movdqa	%xmm7, -112(rp)
	lea	-128(rp), rp
	sub	$16, n
	jnc	L(top)

IFDOS(`	movdqa	(%rsp), %xmm6	')
IFDOS(`	movdqa	16(%rsp), %xmm7	')
IFDOS(`	add	$56, %rsp	')

C Tail: n is now negative but its low four bits still hold the residual limb
C count (0..15); peel off 8, 4, 2, then 1 limbs by testing each bit.
L(sma):	test	$8, R8(n)
	jz	1f
	movdqu	(up), %xmm0
	movdqu	-16(up), %xmm1
	movdqu	-32(up), %xmm2
	movdqu	-48(up), %xmm3
	lea	-64(up), up
	movdqa	%xmm0, (rp)
	movdqa	%xmm1, -16(rp)
	movdqa	%xmm2, -32(rp)
	movdqa	%xmm3, -48(rp)
	lea	-64(rp), rp
1:
	test	$4, R8(n)
	jz	1f
	movdqu	(up), %xmm0
	movdqu	-16(up), %xmm1
	lea	-32(up), up
	movdqa	%xmm0, (rp)
	movdqa	%xmm1, -16(rp)
	lea	-32(rp), rp
1:
	test	$2, R8(n)
	jz	1f
	movdqu	(up), %xmm0
	lea	-16(up), up
	movdqa	%xmm0, (rp)
	lea	-16(rp), rp
1:
	test	$1, R8(n)
	jz	1f
	mov	8(up), %r8		C final odd limb: 8-byte move
	mov	%r8, 8(rp)
1:
L(don):	FUNC_EXIT()
	ret
EPILOGUE()