1dnl AMD64 mpn_lshift optimised for CPUs with fast SSE. 2 3dnl Contributed to the GNU project by David Harvey and Torbjorn Granlund. 4 5dnl Copyright 2010-2012, 2018 Free Software Foundation, Inc. 6 7dnl This file is part of the GNU MP Library. 8dnl 9dnl The GNU MP Library is free software; you can redistribute it and/or modify 10dnl it under the terms of either: 11dnl 12dnl * the GNU Lesser General Public License as published by the Free 13dnl Software Foundation; either version 3 of the License, or (at your 14dnl option) any later version. 15dnl 16dnl or 17dnl 18dnl * the GNU General Public License as published by the Free Software 19dnl Foundation; either version 2 of the License, or (at your option) any 20dnl later version. 21dnl 22dnl or both in parallel, as here. 23dnl 24dnl The GNU MP Library is distributed in the hope that it will be useful, but 25dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 26dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 27dnl for more details. 28dnl 29dnl You should have received copies of the GNU General Public License and the 30dnl GNU Lesser General Public License along with the GNU MP Library. If not, 31dnl see https://www.gnu.org/licenses/. 32 33include(`../config.m4') 34 35 36C cycles/limb cycles/limb good 37C 16-byte aligned 16-byte unaligned for cpu? 38C AMD K8,K9 ? ? 39C AMD K10 1.68 (1.45) 1.75 (1.49) Y 40C AMD bd1 1.82 (1.75) 1.82 (1.75) Y 41C AMD bobcat 4 4 42C Intel P4 3 (2.7) 3 (2.7) Y 43C Intel core2 2.05 (1.67) 2.55 (1.75) 44C Intel NHM 2.05 (1.75) 2.09 (2) 45C Intel SBR 1.5 (1.3125) 1.5 (1.4375) Y 46C Intel atom ? ? 47C VIA nano 2.25 (2) 2.5 (2) Y 48 49C We try to do as many 16-byte operations as possible. The top-most and 50C bottom-most writes might need 8-byte operations. 51 52C There are two inner-loops, one for when rp = ap (mod 16) and one when this is 53C not true. 
C not true.  The aligned case reads 16+8 bytes, the unaligned case reads
C 16+8+X bytes, where X is 8 or 16 depending on how punpcklqdq is implemented.

C This is not yet great code:
C   (1) The unaligned case makes many reads.
C   (2) We should do some unrolling, at least 2-way.
C With 2-way unrolling but no scheduling we reach 1.5 c/l on K10 and 2 c/l on
C Nano.

C INPUT PARAMETERS (SysV AMD64 argument order; FUNC_ENTRY(4) remaps for Win64)
define(`rp', `%rdi')			C destination limb pointer
define(`ap', `%rsi')			C source limb pointer
define(`n', `%rdx')			C limb count
define(`cnt', `%rcx')			C shift amount in bits

ASM_START()
	TEXT
	ALIGN(64)
PROLOGUE(mpn_lshift)
	FUNC_ENTRY(4)
	movd	R32(%rcx), %xmm4	C xmm4 = cnt, the left-shift count
	mov	$64, R32(%rax)
	sub	R32(%rcx), R32(%rax)
	movd	R32(%rax), %xmm5	C xmm5 = 64-cnt, complementary right shift

C Compute the return value: the bits shifted out of the top limb.  A 64-bit
C shr masks its count to 6 bits, so shifting by -cnt is shifting by 64-cnt.
	neg	R32(%rcx)
	mov	-8(ap,n,8), %rax	C rax = ap[n-1]
	shr	R8(%rcx), %rax		C rax = ap[n-1] >> (64-cnt)

	cmp	$2, n
	jle	L(le2)			C NOTE: these flags are consumed again at
					C L(le2) to tell n = 1 from n = 2

C cnt is dead in %rcx now (it lives on in xmm4/xmm5), so %rcx is free scratch.
	lea	(rp,n,8), R32(%rcx)
	test	$8, R8(%rcx)		C is the top of rp[] 16-byte aligned?
	je	L(rp_aligned)

C Do one initial limb in order to make rp aligned
	movq	-8(ap,n,8), %xmm0	C xmm0 = ap[n-1]
	movq	-16(ap,n,8), %xmm1	C xmm1 = ap[n-2]
	psllq	%xmm4, %xmm0
	psrlq	%xmm5, %xmm1
	por	%xmm1, %xmm0		C (ap[n-1]<<cnt) | (ap[n-2]>>(64-cnt))
	movq	%xmm0, -8(rp,n,8)
	dec	n

L(rp_aligned):
	lea	(ap,n,8), R32(%rcx)	C pick a loop flavour from ap's alignment
	test	$8, R8(%rcx)		C (rp is 16-byte aligned in both loops)
	je	L(aent)
	jmp	L(uent)
C *****************************************************************************

C Handle the case when ap != rp (mod 16).

	ALIGN(16)
L(utop):movdqa	-8(ap,n,8), %xmm0	C unaligned read straddling two limbs
	movq	(ap,n,8), %xmm1
	punpcklqdq 8(ap,n,8), %xmm1	C rebuild the high pair with extra reads
	psllq	%xmm4, %xmm1
	psrlq	%xmm5, %xmm0
	por	%xmm1, %xmm0
	movdqa	%xmm0, (rp,n,8)		C aligned 16-byte store of two limbs
L(uent):sub	$2, n
	ja	L(utop)			C more than two limbs still to do

	jne	L(end8)			C n was odd: a single limb remains

C Exactly two limbs remain.  The zeroed low half of xmm0 makes the right
C shift contribute nothing below rp[0].
	movq	(ap), %xmm1
	pxor	%xmm0, %xmm0
	punpcklqdq %xmm1, %xmm0		C xmm0 = B*ap[0] + 0
	punpcklqdq 8(ap), %xmm1		C xmm1 = B*ap[1] + ap[0]
	psllq	%xmm4, %xmm1
	psrlq	%xmm5, %xmm0
	por	%xmm1, %xmm0
	movdqa	%xmm0, (rp)
	FUNC_EXIT()
	ret
C *****************************************************************************

C Handle the case when ap = rp (mod 16).

	ALIGN(16)
L(atop):movdqa	(ap,n,8), %xmm0		C xmm0 = B*ap[n-1] + ap[n-2]
	movq	-8(ap,n,8), %xmm1	C xmm1 = ap[n-3]
	punpcklqdq %xmm0, %xmm1		C xmm1 = B*ap[n-2] + ap[n-3]
	psllq	%xmm4, %xmm0
	psrlq	%xmm5, %xmm1
	por	%xmm1, %xmm0
	movdqa	%xmm0, (rp,n,8)
L(aent):
	sub	$2, n
	ja	L(atop)			C more than two limbs still to do
	jne	L(end8)			C n was odd: a single limb remains

C Exactly two limbs remain; one aligned 16-byte read suffices.
	movdqa	(ap), %xmm1		C xmm1 = B*ap[1] + ap[0]
	pxor	%xmm0, %xmm0
	punpcklqdq %xmm1, %xmm0		C xmm0 = B*ap[0] + 0
	psllq	%xmm4, %xmm1
	psrlq	%xmm5, %xmm0
	por	%xmm1, %xmm0
	movdqa	%xmm0, (rp)
	FUNC_EXIT()
	ret
C *****************************************************************************

	ALIGN(16)
L(le2):	jne	L(end8)			C flags still live from cmp $2,n: n = 1

C n = 2: form rp[1], then fall through to L(end8) for rp[0].
	movq	8(ap), %xmm0		C xmm0 = ap[1]
	movq	(ap), %xmm1		C xmm1 = ap[0]
	psllq	%xmm4, %xmm0
	psrlq	%xmm5, %xmm1
	por	%xmm1, %xmm0		C (ap[1]<<cnt) | (ap[0]>>(64-cnt))
	movq	%xmm0, 8(rp)

L(end8):movq	(ap), %xmm0		C bottom limb: nothing shifts in from below
	psllq	%xmm4, %xmm0
	movq	%xmm0, (rp)
	FUNC_EXIT()
	ret
EPILOGUE()