dnl  AMD64 mpn_com optimised for CPUs with fast SSE.

dnl  Copyright 2003, 2005, 2007, 2011, 2012, 2015 Free Software Foundation,
dnl  Inc.

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C                   cycles/limb   cycles/limb   cycles/limb   good
C                    aligned       unaligned     best seen    for cpu?
C AMD K8,K9          2.0           2.0                        N
C AMD K10            0.85          1.3                        Y/N
C AMD bull           1.40          1.40                       Y
C AMD pile           0.9-1.4       0.9-1.4                    Y
C AMD steam
C AMD excavator
C AMD bobcat         3.1           3.1                        N
C AMD jaguar         0.91          0.91          opt/opt      Y
C Intel P4           2.28          illop                      Y
C Intel core2        1.02          1.02                       N
C Intel NHM          0.53          0.68                       Y
C Intel SBR          0.51          0.75          opt/0.65     Y/N
C Intel IBR          0.50          0.57          opt/opt      Y
C Intel HWL          0.51          0.64          opt/0.58     Y
C Intel BWL          0.61          0.65          0.57/opt     Y
C Intel atom         3.68          3.68                       N
C Intel SLM          1.09          1.35                       N
C VIA nano           1.17          5.09                       Y/N

C We try to do as many 16-byte operations as possible.  The top-most and
C bottom-most writes might need 8-byte operations.  We can always write using
C aligned 16-byte operations; we read with both aligned and unaligned 16-byte
C operations.

C Instead of having separate loops for aligned and unaligned reads, we read
C using MOVDQU.  This seems to work well except on core2, where performance
C doubles when reading with MOVDQA (for an aligned source).  It is unclear
C how best to handle the unaligned case there.

C INPUT PARAMETERS
define(`rp', `%rdi')
define(`up', `%rsi')
define(`n',  `%rdx')

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
        TEXT
        ALIGN(16)
PROLOGUE(mpn_com)
        FUNC_ENTRY(3)

IFDOS(` add     $-56, %rsp      ')
IFDOS(` movdqa  %xmm6, (%rsp)   ')
IFDOS(` movdqa  %xmm7, 16(%rsp) ')

        pcmpeqb %xmm7, %xmm7            C set to 111...111
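
C If rp is not 16-byte aligned, complement one 8-byte limb first, so that
C all of the 16-byte writes below can use aligned stores (movdqa).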
        test    $8, R8(rp)              C is rp 16-byte aligned?
        jz      L(ali)                  C jump if rp aligned
        mov     (up), %rax
        lea     8(up), up
        not     %rax
        mov     %rax, (rp)
        lea     8(rp), rp
        dec     n

        sub     $14, n
        jc      L(sma)                  C branch if fewer than 14 limbs remain

        ALIGN(16)
C Main loop: complement 14 limbs (112 bytes) per iteration.
L(top): movdqu  (up), %xmm0
        movdqu  16(up), %xmm1
        movdqu  32(up), %xmm2
        movdqu  48(up), %xmm3
        movdqu  64(up), %xmm4
        movdqu  80(up), %xmm5
        movdqu  96(up), %xmm6
        lea     112(up), up
        pxor    %xmm7, %xmm0
        pxor    %xmm7, %xmm1
        pxor    %xmm7, %xmm2
        pxor    %xmm7, %xmm3
        pxor    %xmm7, %xmm4
        pxor    %xmm7, %xmm5
        pxor    %xmm7, %xmm6
        movdqa  %xmm0, (rp)
        movdqa  %xmm1, 16(rp)
        movdqa  %xmm2, 32(rp)
        movdqa  %xmm3, 48(rp)
        movdqa  %xmm4, 64(rp)
        movdqa  %xmm5, 80(rp)
        movdqa  %xmm6, 96(rp)
        lea     112(rp), rp
L(ali): sub     $14, n
        jnc     L(top)

C Wind down: 0..13 limbs remain; the bits of n select the block sizes.
L(sma): add     $14, n
        test    $8, R8(n)
        jz      1f
        movdqu  (up), %xmm0             C complement 8 limbs
        movdqu  16(up), %xmm1
        movdqu  32(up), %xmm2
        movdqu  48(up), %xmm3
        lea     64(up), up
        pxor    %xmm7, %xmm0
        pxor    %xmm7, %xmm1
        pxor    %xmm7, %xmm2
        pxor    %xmm7, %xmm3
        movdqa  %xmm0, (rp)
        movdqa  %xmm1, 16(rp)
        movdqa  %xmm2, 32(rp)
        movdqa  %xmm3, 48(rp)
        lea     64(rp), rp
1:
        test    $4, R8(n)
        jz      1f
        movdqu  (up), %xmm0             C complement 4 limbs
        movdqu  16(up), %xmm1
        lea     32(up), up
        pxor    %xmm7, %xmm0
        pxor    %xmm7, %xmm1
        movdqa  %xmm0, (rp)
        movdqa  %xmm1, 16(rp)
        lea     32(rp), rp
1:
        test    $2, R8(n)
        jz      1f
        movdqu  (up), %xmm0             C complement 2 limbs
        lea     16(up), up
        pxor    %xmm7, %xmm0
        movdqa  %xmm0, (rp)
        lea     16(rp), rp
1:
        test    $1, R8(n)
        jz      1f
        mov     (up), %rax              C complement the final limb
        not     %rax
        mov     %rax, (rp)
1:
L(don):
IFDOS(` movdqa  (%rsp), %xmm6   ')
IFDOS(` movdqa  16(%rsp), %xmm7 ')
IFDOS(` add     $56, %rsp       ')
        FUNC_EXIT()
        ret
EPILOGUE()
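
C For reference, here is a rough C sketch of the control flow above.  It is
C illustrative only and not part of the library; com_ref is a made-up name,
C and mp_limb_t/mp_size_t are the usual GMP types.  The assembly implements
C the final loop with the bit tests on n seen above (8, 4, 2, then 1 limbs)
C rather than limb by limb.
C
C     #include <stdint.h>
C     #include <gmp.h>
C
C     static void
C     com_ref (mp_limb_t *rp, const mp_limb_t *up, mp_size_t n)
C     {
C       mp_size_t i;
C       if ((uintptr_t) rp & 8)     /* peel one limb to 16-byte align rp */
C         {
C           *rp++ = ~ *up++;
C           n--;
C         }
C       while (n >= 14)             /* main loop, 14 limbs per iteration */
C         {
C           for (i = 0; i < 14; i++)
C             rp[i] = ~ up[i];
C           rp += 14;
C           up += 14;
C           n -= 14;
C         }
C       for (i = 0; i < n; i++)     /* wind down the last 0..13 limbs */
C         rp[i] = ~ up[i];
C     }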