dnl  AMD64 mpn_com optimised for CPUs with fast SSE.

dnl  Copyright 2003, 2005, 2007, 2011, 2012, 2015 Free Software Foundation,
dnl  Inc.

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C                  cycles/limb   cycles/limb   cycles/limb    good
C                  aligned       unaligned     best seen      for cpu?
C AMD K8,K9         2.0           2.0                         N
C AMD K10           0.85          1.3                         Y/N
C AMD bull          1.40          1.40                        Y
C AMD pile           0.9-1.4       0.9-1.4                    Y
C AMD steam
C AMD excavator
C AMD bobcat        3.1           3.1                         N
C AMD jaguar        0.91          0.91          opt/opt       Y
C Intel P4          2.28         illop                        Y
C Intel core2       1.02          1.02                        N
C Intel NHM         0.53          0.68                        Y
C Intel SBR         0.51          0.75          opt/0.65      Y/N
C Intel IBR         0.50          0.57          opt/opt       Y
C Intel HWL         0.51          0.64          opt/0.58      Y
C Intel BWL         0.61          0.65          0.57/opt      Y
C Intel atom        3.68          3.68                        N
C Intel SLM         1.09          1.35                        N
C VIA nano          1.17          5.09                        Y/N

C We try to do as many 16-byte operations as possible.  The top-most and
C bottom-most writes might need 8-byte operations.  We can always write using
C aligned 16-byte operations; we read with both aligned and unaligned 16-byte
C operations.

C Instead of having separate loops for aligned and unaligned reads, we always
C read using MOVDQU.  This works well except on core2, where performance
C doubles when an aligned source is read with MOVDQA instead.  It is unclear
C how best to handle the unaligned case there.
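
C For reference, the operation computed here is the limb-wise ones'
C complement, rp[i] = ~up[i] for 0 <= i < n.  A minimal C sketch of the same
C semantics (the name refmpn_com is illustrative only, not part of the
C library API; mp_limb_t and mp_size_t are the GMP limb and size types from
C gmp.h):
C
C     #include <gmp.h>
C
C     void
C     refmpn_com (mp_limb_t *rp, const mp_limb_t *up, mp_size_t n)
C     {
C       mp_size_t i;
C       for (i = 0; i < n; i++)
C         rp[i] = ~up[i];
C     }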

C INPUT PARAMETERS
define(`rp', `%rdi')
define(`up', `%rsi')
define(`n',  `%rdx')

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
        TEXT
        ALIGN(16)
PROLOGUE(mpn_com)
        FUNC_ENTRY(3)

        pcmpeqb %xmm7, %xmm7            C set to 111...111

        test    $8, R8(rp)              C is rp 16-byte aligned?
        jz      L(ali)                  C jump if rp aligned
        mov     (up), %rax              C complement one leading limb with
        lea     8(up), up               C 64-bit ops to 16-byte align rp
        not     %rax
        mov     %rax, (rp)
        lea     8(rp), rp
        dec     n

        sub     $14, n
        jc      L(sma)

        ALIGN(16)
L(top): movdqu  (up), %xmm0             C main loop: complement 14 limbs
        movdqu  16(up), %xmm1           C (7 x 16 bytes) per iteration
        movdqu  32(up), %xmm2
        movdqu  48(up), %xmm3
        movdqu  64(up), %xmm4
        movdqu  80(up), %xmm5
        movdqu  96(up), %xmm6
        lea     112(up), up
        pxor    %xmm7, %xmm0
        pxor    %xmm7, %xmm1
        pxor    %xmm7, %xmm2
        pxor    %xmm7, %xmm3
        pxor    %xmm7, %xmm4
        pxor    %xmm7, %xmm5
        pxor    %xmm7, %xmm6
        movdqa  %xmm0, (rp)
        movdqa  %xmm1, 16(rp)
        movdqa  %xmm2, 32(rp)
        movdqa  %xmm3, 48(rp)
        movdqa  %xmm4, 64(rp)
        movdqa  %xmm5, 80(rp)
        movdqa  %xmm6, 96(rp)
        lea     112(rp), rp
L(ali): sub     $14, n
        jnc     L(top)

L(sma): add     $14, n                  C wind-down: 0 <= n <= 13 limbs left,
        test    $8, R8(n)               C handled in 8/4/2/1-limb chunks
        jz      1f
        movdqu  (up), %xmm0
        movdqu  16(up), %xmm1
        movdqu  32(up), %xmm2
        movdqu  48(up), %xmm3
        lea     64(up), up
        pxor    %xmm7, %xmm0
        pxor    %xmm7, %xmm1
        pxor    %xmm7, %xmm2
        pxor    %xmm7, %xmm3
        movdqa  %xmm0, (rp)
        movdqa  %xmm1, 16(rp)
        movdqa  %xmm2, 32(rp)
        movdqa  %xmm3, 48(rp)
        lea     64(rp), rp
1:
        test    $4, R8(n)
        jz      1f
        movdqu  (up), %xmm0
        movdqu  16(up), %xmm1
        lea     32(up), up
        pxor    %xmm7, %xmm0
        pxor    %xmm7, %xmm1
        movdqa  %xmm0, (rp)
        movdqa  %xmm1, 16(rp)
        lea     32(rp), rp
1:
        test    $2, R8(n)
        jz      1f
        movdqu  (up), %xmm0
        lea     16(up), up
        pxor    %xmm7, %xmm0
        movdqa  %xmm0, (rp)
        lea     16(rp), rp
1:
        test    $1, R8(n)
        jz      1f
        mov     (up), %rax              C final odd limb, 64-bit complement
        not     %rax
        mov     %rax, (rp)
1:
L(don): FUNC_EXIT()
        ret
EPILOGUE()
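
C To exercise this routine from C, go through GMP's public mpn layer;
C mpn_com is the documented interface that this file implements.  A minimal
C usage sketch (the limb values are arbitrary; link with -lgmp):
C
C     #include <assert.h>
C     #include <gmp.h>
C
C     int
C     main (void)
C     {
C       mp_limb_t up[3] = { 0, 1, ~(mp_limb_t) 0 };
C       mp_limb_t rp[3];
C       mpn_com (rp, up, 3);      /* rp[i] = ~up[i] for i = 0..2 */
C       assert (rp[0] == ~(mp_limb_t) 0);
C       assert (rp[1] == ~(mp_limb_t) 1);
C       assert (rp[2] == 0);
C       return 0;
C     }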