dnl  AMD64 mpn_lshiftc optimised for CPUs with fast SSE.

dnl  Contributed to the GNU project by David Harvey and Torbjorn Granlund.

dnl  Copyright 2010-2012, 2018 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C                  cycles/limb          cycles/limb        good
C               16-byte aligned     16-byte unaligned     for cpu?
C AMD K8,K9          ?                   ?
C AMD K10            1.85 (1.635)        1.9  (1.67)         Y
C AMD bd1            1.82 (1.75)         1.82 (1.75)         Y
C AMD bobcat         4.5                 4.5
C Intel P4           3.6  (3.125)        3.6  (3.125)        Y
C Intel core2        2.05 (1.67)         2.55 (1.75)
C Intel NHM          2.05 (1.875)        2.6  (2.25)
C Intel SBR          1.55 (1.44)         2    (1.57)         Y
C Intel atom         ?                   ?
C VIA nano           2.5  (2.5)          2.5  (2.5)          Y

C We try to do as many 16-byte operations as possible.  The top-most and
C bottom-most writes might need 8-byte operations.  We always write using
C 16-byte operations; we read with both 8-byte and 16-byte operations.

C There are two inner loops, one for when rp = ap (mod 16) and one for when
C this is not true.  The aligned case reads 16+8 bytes per iteration, the
C unaligned case reads 16+8+X bytes per iteration, where X is 8 or 16
C depending on how punpcklqdq is implemented.

C This is not yet great code:
C   (1) The unaligned case makes too many reads.
C   (2) We should do some unrolling, at least 2-way.
C With 2-way unrolling but no scheduling we reach 1.5 c/l on K10 and 2 c/l on
C Nano.

C INPUT PARAMETERS
define(`rp',  `%rdi')
define(`ap',  `%rsi')
define(`n',   `%rdx')
define(`cnt', `%rcx')

ASM_START()
        TEXT
        ALIGN(16)
PROLOGUE(mpn_lshiftc)
        FUNC_ENTRY(4)
        movd    R32(%rcx), %xmm4        C xmm4 = cnt
        mov     $64, R32(%rax)
        sub     R32(%rcx), R32(%rax)
        movd    R32(%rax), %xmm5        C xmm5 = 64-cnt

        neg     R32(%rcx)               C %cl = -cnt = 64-cnt (mod 64)
        mov     -8(ap,n,8), %rax
        shr     R8(%rcx), %rax          C retval = ap[n-1] >> (64-cnt)

        pcmpeqb %xmm2, %xmm2            C set to 111...111

        cmp     $2, n
        jle     L(le2)

        lea     (rp,n,8), R32(%rcx)
        test    $8, R8(%rcx)            C is the top of rp 16-byte aligned?
        je      L(rp_aligned)

C Do one initial limb in order to make rp aligned
        movq    -8(ap,n,8), %xmm0
        movq    -16(ap,n,8), %xmm1
        psllq   %xmm4, %xmm0
        psrlq   %xmm5, %xmm1
        por     %xmm1, %xmm0
        pxor    %xmm2, %xmm0
        movq    %xmm0, -8(rp,n,8)
        dec     n

L(rp_aligned):
        lea     (ap,n,8), R32(%rcx)
        test    $8, R8(%rcx)            C pick the loop matching ap's alignment
        je      L(aent)
        jmp     L(uent)
C *****************************************************************************

C Handle the case when ap != rp (mod 16).
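
C Each trip through L(utop) below produces two complemented result limbs,
C rp[i] and rp[i+1].  In C terms the step is (a minimal sketch, assuming
C 64-bit limbs and 1 <= cnt <= 63):
C
C   rp[i]   = ~((ap[i]   << cnt) | (ap[i-1] >> (64 - cnt)));
C   rp[i+1] = ~((ap[i+1] << cnt) | (ap[i]   >> (64 - cnt)));
C
C Since ap = rp + 8 (mod 16) here, the 16-byte source pair {ap[i+1], ap[i]}
C that feeds each aligned store starts at an address that is 8 (mod 16), so
C it cannot be fetched with a single movdqa; it is assembled with a movq plus
C a punpcklqdq from memory, which is where the extra reads of item (1) above
C come from.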

        ALIGN(16)
L(utop):movq    (ap,n,8), %xmm1         C xmm1 = ap[n-2]
        punpcklqdq  8(ap,n,8), %xmm1    C xmm1 = B*ap[n-1] + ap[n-2]
        movdqa  -8(ap,n,8), %xmm0       C xmm0 = B*ap[n-2] + ap[n-3]
        psllq   %xmm4, %xmm1
        psrlq   %xmm5, %xmm0
        por     %xmm1, %xmm0
        pxor    %xmm2, %xmm0            C complement the combined limbs
        movdqa  %xmm0, (rp,n,8)
L(uent):sub     $2, n
        ja      L(utop)

        jne     L(end8)                 C n was odd: one low limb remains

        movq    (ap), %xmm1
        pxor    %xmm0, %xmm0
        punpcklqdq  %xmm1, %xmm0        C xmm0 = B*ap[0]
        punpcklqdq  8(ap), %xmm1        C xmm1 = B*ap[1] + ap[0]
        psllq   %xmm4, %xmm1
        psrlq   %xmm5, %xmm0
        por     %xmm1, %xmm0
        pxor    %xmm2, %xmm0
        movdqa  %xmm0, (rp)
        FUNC_EXIT()
        ret
C *****************************************************************************

C Handle the case when ap = rp (mod 16).

        ALIGN(16)
L(atop):movdqa  (ap,n,8), %xmm0         C xmm0 = B*ap[n-1] + ap[n-2]
        movq    -8(ap,n,8), %xmm1       C xmm1 = ap[n-3]
        punpcklqdq  %xmm0, %xmm1        C xmm1 = B*ap[n-2] + ap[n-3]
        psllq   %xmm4, %xmm0
        psrlq   %xmm5, %xmm1
        por     %xmm1, %xmm0
        pxor    %xmm2, %xmm0
        movdqa  %xmm0, (rp,n,8)
L(aent):sub     $2, n
        ja      L(atop)

        jne     L(end8)                 C n was odd: one low limb remains

        movdqa  (ap), %xmm0             C xmm0 = B*ap[1] + ap[0]
        pxor    %xmm1, %xmm1
        punpcklqdq  %xmm0, %xmm1        C xmm1 = B*ap[0]
        psllq   %xmm4, %xmm0
        psrlq   %xmm5, %xmm1
        por     %xmm1, %xmm0
        pxor    %xmm2, %xmm0
        movdqa  %xmm0, (rp)
        FUNC_EXIT()
        ret
C *****************************************************************************

        ALIGN(16)
L(le2): jne     L(end8)                 C n = 1: just the low limb

        movq    8(ap), %xmm0            C n = 2: high limb here, low at L(end8)
        movq    (ap), %xmm1
        psllq   %xmm4, %xmm0
        psrlq   %xmm5, %xmm1
        por     %xmm1, %xmm0
        pxor    %xmm2, %xmm0
        movq    %xmm0, 8(rp)

L(end8):movq    (ap), %xmm0             C lowest limb: no bits shifted in
        psllq   %xmm4, %xmm0
        pxor    %xmm2, %xmm0
        movq    %xmm0, (rp)
        FUNC_EXIT()
        ret
EPILOGUE()
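
C For reference, the limb-level semantics of the whole routine match the
C following C sketch (an illustration only, not GMP source; it assumes 64-bit
C limbs, n >= 1, and 1 <= cnt <= 63, and the name ref_lshiftc is ours):
C
C   mp_limb_t
C   ref_lshiftc (mp_ptr rp, mp_srcptr ap, mp_size_t n, unsigned cnt)
C   {
C     mp_limb_t retval = ap[n - 1] >> (64 - cnt);  /* bits shifted out */
C     mp_size_t i;
C     for (i = n - 1; i > 0; i--)                  /* high-to-low, as here */
C       rp[i] = ~((ap[i] << cnt) | (ap[i - 1] >> (64 - cnt)));
C     rp[0] = ~(ap[0] << cnt);
C     return retval;
C   }
C
C Walking from the high end down matches the order in which this code writes
C rp, so overlapping operands with rp >= ap work, as for mpn_lshift.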