dnl  AMD64 mpn_lshiftc optimised for CPUs with fast SSE.

dnl  Contributed to the GNU project by David Harvey and Torbjorn Granlund.

dnl  Copyright 2010, 2011, 2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C                     cycles/limb        cycles/limb         good
C                  16-byte aligned    16-byte unaligned    for cpu?
C AMD K8,K9             ?                  ?
C AMD K10          1.85  (1.635)      1.9   (1.67)            Y
C AMD bd1          1.82  (1.75)       1.82  (1.75)            Y
C AMD bobcat       4.5                4.5
C Intel P4         3.6   (3.125)      3.6   (3.125)           Y
C Intel core2      2.05  (1.67)       2.55  (1.75)
C Intel NHM        2.05  (1.875)      2.6   (2.25)
C Intel SBR        1.55  (1.44)       2     (1.57)            Y
C Intel atom            ?                  ?
C VIA nano         2.5   (2.5)        2.5   (2.5)             Y

C We try to do as many 16-byte operations as possible.  The top-most and
C bottom-most writes might need 8-byte operations.  Apart from those, we
C write using aligned 16-byte operations and read with both 8-byte and
C 16-byte operations.

C There are two inner loops: one for when rp = ap (mod 16) and one for when
C this does not hold.  The aligned case reads 16+8 bytes per iteration; the
C unaligned case reads 16+8+X bytes, where X is 8 or 16 depending on how
C punpcklqdq is implemented.

C This is not yet great code:
C   (1) The unaligned case makes too many reads.
C   (2) We should do some unrolling, at least 2-way.
C With 2-way unrolling but no scheduling we reach 1.5 c/l on K10 and 2 c/l on
C Nano.

C INPUT PARAMETERS
define(`rp',  `%rdi')
define(`ap',  `%rsi')
define(`n',   `%rdx')
define(`cnt', `%rcx')

ASM_START()
        TEXT
        ALIGN(16)
PROLOGUE(mpn_lshiftc)
        movd    R32(%rcx), %xmm4        C xmm4 = cnt
        mov     $64, R32(%rax)
        sub     R32(%rcx), R32(%rax)
        movd    R32(%rax), %xmm5        C xmm5 = 64-cnt

        neg     R32(%rcx)               C -cnt mod 64 = 64-cnt, for shr
        mov     -8(ap,n,8), %rax
        shr     R8(%rcx), %rax          C return value = out-shifted bits

        pcmpeqb %xmm7, %xmm7            C set to 111...111, for complementing

        cmp     $2, n
        jle     L(le2)

        lea     (rp,n,8), R32(%rcx)
        test    $8, R8(%rcx)            C is rp + 8n 16-byte aligned?
        je      L(rp_aligned)

C Do one initial limb in order to make rp aligned
        movq    -8(ap,n,8), %xmm0       C xmm0 = ap[n-1]
        movq    -16(ap,n,8), %xmm1      C xmm1 = ap[n-2]
        psllq   %xmm4, %xmm0
        psrlq   %xmm5, %xmm1
        por     %xmm1, %xmm0
        pxor    %xmm7, %xmm0
        movq    %xmm0, -8(rp,n,8)       C rp[n-1] done with an 8-byte write
        dec     n

L(rp_aligned):
        lea     (ap,n,8), R32(%rcx)
        test    $8, R8(%rcx)            C do ap and rp agree mod 16?
        je      L(aent)
        jmp     L(uent)
C *****************************************************************************

C Handle the case when ap != rp (mod 16).
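
C Per iteration the loop below computes two result limbs from three source
C limbs (B = 2^64; limb indices refer to n as it stands before the sub at
C L(uent)):
C   rp[n-1] = ~((ap[n-1] << cnt) | (ap[n-2] >> (64-cnt)))
C   rp[n-2] = ~((ap[n-2] << cnt) | (ap[n-3] >> (64-cnt)))
C The aligned loop at L(atop) computes the same values from the same reads,
C merely combined differently to keep the 16-byte accesses aligned.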

        ALIGN(16)
L(utop):movq    (ap,n,8), %xmm1         C xmm1 = ap[n-2]
        punpcklqdq 8(ap,n,8), %xmm1     C xmm1 = B*ap[n-1] + ap[n-2]
        movdqa  -8(ap,n,8), %xmm0       C xmm0 = B*ap[n-2] + ap[n-3]
        psllq   %xmm4, %xmm1
        psrlq   %xmm5, %xmm0
        por     %xmm1, %xmm0
        pxor    %xmm7, %xmm0
        movdqa  %xmm0, (rp,n,8)
L(uent):sub     $2, n
        ja      L(utop)

        jne     L(end8)                 C odd n: one limb remains

        movq    (ap), %xmm1             C xmm1 = ap[0]
        pxor    %xmm0, %xmm0
        punpcklqdq %xmm1, %xmm0         C xmm0 = B*ap[0]
        punpcklqdq 8(ap), %xmm1         C xmm1 = B*ap[1] + ap[0]
        psllq   %xmm4, %xmm1
        psrlq   %xmm5, %xmm0
        por     %xmm1, %xmm0
        pxor    %xmm7, %xmm0
        movdqa  %xmm0, (rp)             C write rp[1] and rp[0]
        ret
C *****************************************************************************

C Handle the case when ap = rp (mod 16).

        ALIGN(16)
L(atop):movdqa  (ap,n,8), %xmm0         C xmm0 = B*ap[n-1] + ap[n-2]
        movq    -8(ap,n,8), %xmm1       C xmm1 = ap[n-3]
        punpcklqdq %xmm0, %xmm1         C xmm1 = B*ap[n-2] + ap[n-3]
        psllq   %xmm4, %xmm0
        psrlq   %xmm5, %xmm1
        por     %xmm1, %xmm0
        pxor    %xmm7, %xmm0
        movdqa  %xmm0, (rp,n,8)
L(aent):sub     $2, n
        ja      L(atop)

        jne     L(end8)                 C odd n: one limb remains

        movdqa  (ap), %xmm0             C xmm0 = B*ap[1] + ap[0]
        pxor    %xmm1, %xmm1
        punpcklqdq %xmm0, %xmm1         C xmm1 = B*ap[0]
        psllq   %xmm4, %xmm0
        psrlq   %xmm5, %xmm1
        por     %xmm1, %xmm0
        pxor    %xmm7, %xmm0
        movdqa  %xmm0, (rp)             C write rp[1] and rp[0]
        ret
C *****************************************************************************

        ALIGN(16)
L(le2): jne     L(end8)                 C n = 1

        movq    8(ap), %xmm0            C n = 2
        movq    (ap), %xmm1
        psllq   %xmm4, %xmm0
        psrlq   %xmm5, %xmm1
        por     %xmm1, %xmm0
        pxor    %xmm7, %xmm0
        movq    %xmm0, 8(rp)

L(end8):movq    (ap), %xmm0             C rp[0] = ~(ap[0] << cnt)
        psllq   %xmm4, %xmm0
        pxor    %xmm7, %xmm0
        movq    %xmm0, (rp)
        ret
EPILOGUE()
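
C For reference, the function computes {rp,n} = ~({ap,n} << cnt) and returns
C the out-shifted bits.  A plain C sketch of the intended semantics follows
C (illustrative only, assuming 64-bit limbs; it is not used by the build):
C
C   mp_limb_t
C   mpn_lshiftc (mp_ptr rp, mp_srcptr ap, mp_size_t n, unsigned cnt)
C   {
C     mp_limb_t high = ap[n - 1], retval = high >> (64 - cnt);
C     for (mp_size_t i = n - 1; i > 0; i--)
C       {
C         mp_limb_t low = ap[i - 1];
C         rp[i] = ~((high << cnt) | (low >> (64 - cnt)));
C         high = low;
C       }
C     rp[0] = ~(high << cnt);
C     return retval;
C   }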