dnl  AMD64 mpn_lshiftc -- mpn left shift with complement, optimised for Atom.

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  Copyright 2011, 2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb
C AMD K8,K9	 ?
C AMD K10	 ?
C Intel P4	 ?
C Intel core2	 ?
C Intel NHM	 ?
C Intel SBR	 ?
C Intel atom	 5
C VIA nano	 ?

C TODO
C  * Consider using 4-way unrolling.  We reach 4.5 c/l, but the code is 2.5
C    times larger.

C INPUT PARAMETERS
define(`rp',	`%rdi')
define(`up',	`%rsi')
define(`n',	`%rdx')
define(`cnt',	`%rcx')

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_lshiftc)
	FUNC_ENTRY(4)
	lea	-8(up,n,8), up		C up := &up[n-1]
	lea	-8(rp,n,8), rp		C rp := &rp[n-1]
	shr	R32(n)			C n := n/2, carry := n mod 2
	mov	(up), %rax		C load most significant limb
	jnc	L(evn)

	mov	%rax, %r11		C odd n
	shl	R8(%rcx), %r11
	neg	R8(%rcx)		C %cl := 64-cnt (shift counts mask to 6 bits)
	shr	R8(%rcx), %rax		C function result: out-shifted bits
	test	n, n
	jnz	L(gt1)
	not	%r11			C complement, the "c" in lshiftc
	mov	%r11, (rp)
	FUNC_EXIT()
	ret

L(gt1):	mov	-8(up), %r8
	mov	%r8, %r10
	shr	R8(%rcx), %r8
	jmp	L(lo1)

L(evn):	mov	%rax, %r10		C even n
	neg	R8(%rcx)		C %cl := 64-cnt
	shr	R8(%rcx), %rax		C function result: out-shifted bits
	mov	-8(up), %r9
	mov	%r9, %r11
	shr	R8(%rcx), %r9
	neg	R8(%rcx)		C %cl := cnt
	lea	8(rp), rp
	lea	-8(up), up
	jmp	L(lo0)

C	ALIGN(16)
L(top):	shl	R8(%rcx), %r10
	or	%r10, %r9
	shl	R8(%rcx), %r11
	not	%r9
	neg	R8(%rcx)		C %cl := 64-cnt
	mov	-8(up), %r8
	lea	-16(rp), rp
	mov	%r8, %r10
	shr	R8(%rcx), %r8
	mov	%r9, 8(rp)
L(lo1):	or	%r11, %r8
	mov	-16(up), %r9
	mov	%r9, %r11
	shr	R8(%rcx), %r9
	lea	-16(up), up
	neg	R8(%rcx)		C %cl := cnt
	not	%r8
	mov	%r8, (rp)
L(lo0):	dec	n
	jg	L(top)

L(end):	shl	R8(%rcx), %r10
	or	%r10, %r9
	not	%r9
	shl	R8(%rcx), %r11
	not	%r11
	mov	%r9, -8(rp)
	mov	%r11, -16(rp)
	FUNC_EXIT()
	ret
EPILOGUE()
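
C For reference, the operation performed above corresponds roughly to the
C following portable C loop: a minimal sketch assuming 64-bit limbs and a
C nail-less build, with 1 <= cnt <= 63.  The name ref_lshiftc and the use
C of plain uint64_t are illustrative only; this is not the library's
C generic code verbatim.
C
C	#include <stdint.h>
C	#include <stddef.h>
C
C	uint64_t
C	ref_lshiftc (uint64_t *rp, const uint64_t *up, size_t n, unsigned cnt)
C	{
C	  unsigned tnc = 64 - cnt;
C	  uint64_t retval = up[n - 1] >> tnc;	/* out-shifted bits, not complemented */
C	  uint64_t high = up[n - 1] << cnt;
C	  uint64_t low;
C	  size_t i;
C
C	  for (i = n - 1; i > 0; i--)
C	    {
C	      low = up[i - 1];
C	      rp[i] = ~(high | (low >> tnc));	/* store complemented limb */
C	      high = low << cnt;
C	    }
C	  rp[0] = ~high;
C	  return retval;
C	}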