dnl  AMD64 mpn_com optimised for CPUs with fast SSE.

dnl  Copyright 2003, 2005, 2007, 2011, 2012 Free Software Foundation, Inc.

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C                    cycles/limb  cycles/limb  cycles/limb    good
C                      aligned     unaligned    best seen    for cpu?
C AMD K8,K9             2.0           2.0                       N
C AMD K10               0.85          1.3                       Y/N
C AMD bd1               1.40          1.40                      Y
C AMD bobcat            3.1           3.1                       N
C Intel P4              2.28         illop                      Y
C Intel core2           1.02          1.02                      N
C Intel NHM             0.53          0.68                      Y
C Intel SBR             0.51          0.75                      Y
C Intel atom            3.68          3.68                      N
C VIA nano              1.17          5.09                      Y/N

C We try to do as many 16-byte operations as possible.  The top-most and
C bottom-most writes might need 8-byte operations.  We can always write using
C aligned 16-byte operations; we read with both aligned and unaligned 16-byte
C operations.

C Instead of having separate loops for aligned and unaligned reading, we read
C using MOVDQU.  This seems to work well except on core2, where performance
C doubles when reading with MOVDQA (for an aligned source).  It is unclear how
C best to handle the unaligned case there.
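
C For reference, a minimal C sketch of the semantics this routine implements
C (illustrative only, not built from this file): each limb of {up,n} is
C bitwise complemented and stored into {rp,n}.
C
C   void mpn_com (mp_ptr rp, mp_srcptr up, mp_size_t n)
C   {
C     mp_size_t i;
C     for (i = 0; i < n; i++)
C       rp[i] = ~up[i];           /* complement one limb */
C   }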

C INPUT PARAMETERS
define(`rp', `%rdi')
define(`up', `%rsi')
define(`n',  `%rdx')

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
        TEXT
        ALIGN(16)
PROLOGUE(mpn_com)
        FUNC_ENTRY(3)

        test    n, n
        jz      L(don)

        pcmpeqb %xmm7, %xmm7            C set to 111...111

        test    $8, R8(rp)              C is rp 16-byte aligned?
        jz      L(ali)                  C jump if rp aligned
        mov     (up), %rax
        lea     8(up), up
        not     %rax
        mov     %rax, (rp)
        lea     8(rp), rp
        dec     n

        sub     $14, n
        jc      L(sma)

        ALIGN(16)
L(top): movdqu  (up), %xmm0
        movdqu  16(up), %xmm1
        movdqu  32(up), %xmm2
        movdqu  48(up), %xmm3
        movdqu  64(up), %xmm4
        movdqu  80(up), %xmm5
        movdqu  96(up), %xmm6
        lea     112(up), up
        pxor    %xmm7, %xmm0
        pxor    %xmm7, %xmm1
        pxor    %xmm7, %xmm2
        pxor    %xmm7, %xmm3
        pxor    %xmm7, %xmm4
        pxor    %xmm7, %xmm5
        pxor    %xmm7, %xmm6
        movdqa  %xmm0, (rp)
        movdqa  %xmm1, 16(rp)
        movdqa  %xmm2, 32(rp)
        movdqa  %xmm3, 48(rp)
        movdqa  %xmm4, 64(rp)
        movdqa  %xmm5, 80(rp)
        movdqa  %xmm6, 96(rp)
        lea     112(rp), rp
L(ali): sub     $14, n
        jnc     L(top)

L(sma): add     $14, n
        test    $8, R8(n)
        jz      1f
        movdqu  (up), %xmm0
        movdqu  16(up), %xmm1
        movdqu  32(up), %xmm2
        movdqu  48(up), %xmm3
        lea     64(up), up
        pxor    %xmm7, %xmm0
        pxor    %xmm7, %xmm1
        pxor    %xmm7, %xmm2
        pxor    %xmm7, %xmm3
        movdqa  %xmm0, (rp)
        movdqa  %xmm1, 16(rp)
        movdqa  %xmm2, 32(rp)
        movdqa  %xmm3, 48(rp)
        lea     64(rp), rp
1:
        test    $4, R8(n)
        jz      1f
        movdqu  (up), %xmm0
        movdqu  16(up), %xmm1
        lea     32(up), up
        pxor    %xmm7, %xmm0
        pxor    %xmm7, %xmm1
        movdqa  %xmm0, (rp)
        movdqa  %xmm1, 16(rp)
        lea     32(rp), rp
1:
        test    $2, R8(n)
        jz      1f
        movdqu  (up), %xmm0
        lea     16(up), up
        pxor    %xmm7, %xmm0
        movdqa  %xmm0, (rp)
        lea     16(rp), rp
1:
        test    $1, R8(n)
        jz      1f
        mov     (up), %rax
        not     %rax
        mov     %rax, (rp)
1:
L(don): FUNC_EXIT()
        ret
EPILOGUE()
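
C For illustration, a worked example of the L(sma) tail handling above: when
C the main loop exits, 0..13 limbs remain, and the blocks guarded by
C "test $8/$4/$2/$1" dispatch on the binary digits of that count.  With 13
C limbs left (binary 1101), the $8 block complements 8 limbs using four
C 16-byte operations, the $4 block the next 4, the $2 block is skipped, and
C the $1 block handles the final limb with a plain 8-byte mov/not/mov.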