dnl  x86-64 mpn_lshiftc optimized for Pentium 4.

dnl  Copyright 2003, 2005, 2007, 2008, 2010, 2012 Free Software Foundation,
dnl  Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C	     cycles/limb
C AMD K8,K9	 ?
C AMD K10	 ?
C Intel P4	 4.15
C Intel core2	 ?
C Intel corei	 ?
C Intel atom	 ?
C VIA nano	 ?

C mpn_lshiftc (rp, up, n, cnt) shifts {up,n} left by cnt bits (1 <= cnt < 64),
C stores the one's complement of the shifted result at {rp,n}, and returns the
C bits shifted out of the most significant limb (not complemented).

C INPUT PARAMETERS
define(`rp',`%rdi')
define(`up',`%rsi')
define(`n',`%rdx')
define(`cnt',`%cl')

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(32)
PROLOGUE(mpn_lshiftc)
	FUNC_ENTRY(4)
	mov	-8(up,n,8), %rax
	pcmpeqd	%mm6, %mm6		C 0xffff...fff
	movd	R32(%rcx), %mm4
	neg	R32(%rcx)		C put rsh count in cl
	and	$63, R32(%rcx)
	movd	R32(%rcx), %mm5

	lea	1(n), R32(%r8)

	shr	R8(%rcx), %rax		C function return value

	and	$3, R32(%r8)
	je	L(rol)			C jump for n = 3, 7, 11, ...

	dec	R32(%r8)
	jne	L(1)
C	n = 4, 8, 12, ...
	movq	-8(up,n,8), %mm2
	psllq	%mm4, %mm2
	movq	-16(up,n,8), %mm0
	pxor	%mm6, %mm2
	psrlq	%mm5, %mm0
	pandn	%mm2, %mm0
	movq	%mm0, -8(rp,n,8)
	dec	n
	jmp	L(rol)

L(1):	dec	R32(%r8)
	je	L(1x)			C jump for n = 1, 5, 9, 13, ...
C	n = 2, 6, 10, 14, ...
	movq	-8(up,n,8), %mm2
	psllq	%mm4, %mm2
	movq	-16(up,n,8), %mm0
	pxor	%mm6, %mm2
	psrlq	%mm5, %mm0
	pandn	%mm2, %mm0
	movq	%mm0, -8(rp,n,8)
	dec	n
L(1x):
	cmp	$1, n
	je	L(ast)
	movq	-8(up,n,8), %mm2
	psllq	%mm4, %mm2
	movq	-16(up,n,8), %mm3
	psllq	%mm4, %mm3
	movq	-16(up,n,8), %mm0
	movq	-24(up,n,8), %mm1
	pxor	%mm6, %mm2
	psrlq	%mm5, %mm0
	pandn	%mm2, %mm0
	pxor	%mm6, %mm3
	psrlq	%mm5, %mm1
	pandn	%mm3, %mm1
	movq	%mm0, -8(rp,n,8)
	movq	%mm1, -16(rp,n,8)
	sub	$2, n

L(rol):	movq	-8(up,n,8), %mm2
	psllq	%mm4, %mm2
	movq	-16(up,n,8), %mm3
	psllq	%mm4, %mm3

	sub	$4, n
	jb	L(end)
	ALIGN(32)
L(top):
	C finish stuff from lsh block
	movq	16(up,n,8), %mm0
	pxor	%mm6, %mm2
	movq	8(up,n,8), %mm1
	psrlq	%mm5, %mm0
	psrlq	%mm5, %mm1
	pandn	%mm2, %mm0
	pxor	%mm6, %mm3
	movq	%mm0, 24(rp,n,8)
	movq	(up,n,8), %mm0
	pandn	%mm3, %mm1
	movq	%mm1, 16(rp,n,8)
	movq	-8(up,n,8), %mm1
	C start two new rsh
	psrlq	%mm5, %mm0
	psrlq	%mm5, %mm1

	C finish stuff from rsh block
	movq	8(up,n,8), %mm2
	pxor	%mm6, %mm0
	movq	(up,n,8), %mm3
	psllq	%mm4, %mm2
	psllq	%mm4, %mm3
	pandn	%mm0, %mm2
	pxor	%mm6, %mm1
	movq	%mm2, 8(rp,n,8)
	movq	-8(up,n,8), %mm2
	pandn	%mm1, %mm3
	movq	%mm3, (rp,n,8)
	movq	-16(up,n,8), %mm3
	C start two new lsh
	sub	$4, n
	psllq	%mm4, %mm2
	psllq	%mm4, %mm3

	jae	L(top)

L(end):	pxor	%mm6, %mm2
	movq	8(up), %mm0
	psrlq	%mm5, %mm0
	pandn	%mm2, %mm0
	pxor	%mm6, %mm3
	movq	(up), %mm1
	psrlq	%mm5, %mm1
	pandn	%mm3, %mm1
	movq	%mm0, 16(rp)
	movq	%mm1, 8(rp)

L(ast):	movq	(up), %mm2
	psllq	%mm4, %mm2
	pxor	%mm6, %mm2
	movq	%mm2, (rp)
	emms
	FUNC_EXIT()
	ret
EPILOGUE()
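
dnl  For reference, a rough C sketch of the operation the code above carries
dnl  out, assuming 64-bit limbs and the usual GMP mpn types (the generic C
dnl  version in mpn/generic/lshiftc.c is the authoritative statement of the
dnl  semantics; this is only an illustrative, hypothetical equivalent):
dnl
dnl	mp_limb_t
dnl	ref_lshiftc (mp_ptr rp, mp_srcptr up, mp_size_t n, unsigned int cnt)
dnl	{
dnl	  unsigned int tnc = 64 - cnt;		/* complementary right-shift count */
dnl	  mp_limb_t high = up[n - 1];
dnl	  mp_limb_t retval = high >> tnc;	/* out-shifted bits, not complemented */
dnl	  mp_size_t i;
dnl	  for (i = n - 1; i > 0; i--)
dnl	    {
dnl	      mp_limb_t low = up[i - 1];
dnl	      rp[i] = ~((high << cnt) | (low >> tnc));	/* shift, then complement */
dnl	      high = low;
dnl	    }
dnl	  rp[0] = ~(high << cnt);
dnl	  return retval;
dnl	}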