dnl  AMD64 mpn_lshift optimised for CPUs with fast SSE including fast movdqu.

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  Copyright 2010-2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C               cycles/limb   cycles/limb   cycles/limb    good
C                 aligned      unaligned     best seen     for cpu?
C AMD K8,K9        3             3             2.35        no, use shl/shr
C AMD K10          1.5-1.8       1.5-1.8       1.33        yes
C AMD bd1          1.7-1.9       1.7-1.9       1.33        yes
C AMD bobcat       3.17          3.17                      yes, bad for n < 20
C Intel P4         4.67          4.67          2.7         no, slow movdqu
C Intel core2      2.15          2.15          1.25        no, use shld/shrd
C Intel NHM        1.66          1.66          1.25        no, use shld/shrd
C Intel SBR        1.3           1.3           1.25        yes, bad for n = 4-6
C Intel atom      11.7          11.7           4.5         no
C VIA nano         5.7           5.95          2.0         no, slow movdqu

C We try to do as many aligned 16-byte operations as possible.  The top-most
C and bottom-most writes might need 8-byte operations.
C
C This variant relies on fast movdqu loads, and uses movdqu even for aligned
C operands, in order to avoid the need for two separate loops.
C
C TODO
C  * Could the 2-limb wind-down code be simplified?
C  * Improve basecase code, using shld/shrd for SBR, and discrete integer
C    shifts for the other affected CPUs.
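
C For reference, the operation implemented here is the usual mpn_lshift:
C shift the n-limb operand {ap,n} left by cnt bits (1 <= cnt < 64), store
C the low part at {rp,n}, and return the bits shifted out of the top limb.
C The C sketch below is illustrative only: it assumes 64-bit limbs without
C nails, and the name ref_lshift is made up for this comment, not part of
C GMP's API.  Like the asm, it works from the top limb downwards so that it
C also works in place (rp == ap).
C
C   #include <gmp.h>   /* mp_limb_t, mp_size_t */
C
C   mp_limb_t
C   ref_lshift (mp_limb_t *rp, const mp_limb_t *ap, mp_size_t n, unsigned cnt)
C   {
C     mp_limb_t high = ap[n - 1];
C     mp_limb_t retval = high >> (64 - cnt);   /* bits shifted out */
C     mp_size_t i;
C
C     for (i = n - 1; i > 0; i--)
C       {
C         mp_limb_t low = ap[i - 1];
C         rp[i] = (high << cnt) | (low >> (64 - cnt));
C         high = low;
C       }
C     rp[0] = high << cnt;                     /* zero fill at the bottom */
C     return retval;
C   }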

C INPUT PARAMETERS
define(`rp',  `%rdi')
define(`ap',  `%rsi')
define(`n',   `%rdx')
define(`cnt', `%rcx')

ASM_START()
	TEXT
	ALIGN(64)
PROLOGUE(mpn_lshift)
	FUNC_ENTRY(4)
	movd	R32(%rcx), %xmm4	C xmm4 = cnt
	mov	$64, R32(%rax)
	sub	R32(%rcx), R32(%rax)
	movd	R32(%rax), %xmm5	C xmm5 = 64-cnt

	neg	R32(%rcx)
	mov	-8(ap,n,8), %rax
	shr	R8(%rcx), %rax		C function result: bits shifted out at the top

	cmp	$3, n
	jle	L(bc)

	lea	(rp,n,8), R32(%rcx)
	test	$8, R8(%rcx)
	jz	L(rp_aligned)

C Do one initial limb in order to make rp aligned
	movq	-8(ap,n,8), %xmm0
	movq	-16(ap,n,8), %xmm1
	psllq	%xmm4, %xmm0
	psrlq	%xmm5, %xmm1
	por	%xmm1, %xmm0
	movq	%xmm0, -8(rp,n,8)
	dec	n

L(rp_aligned):
C Classify n and enter the 8-limbs-per-iteration loop at the right 2-limb chunk
	lea	1(n), %r8d

	and	$6, R32(%r8)
	jz	L(ba0)
	cmp	$4, R32(%r8)
	jz	L(ba4)
	jc	L(ba2)
L(ba6):	add	$-4, n
	jmp	L(i56)
L(ba0):	add	$-6, n
	jmp	L(i70)
L(ba4):	add	$-2, n
	jmp	L(i34)
L(ba2):	add	$-8, n
	jle	L(end)

	ALIGN(16)
L(top):	movdqu	40(ap,n,8), %xmm1
	movdqu	48(ap,n,8), %xmm0
	psllq	%xmm4, %xmm0
	psrlq	%xmm5, %xmm1
	por	%xmm1, %xmm0
	movdqa	%xmm0, 48(rp,n,8)
L(i70):
	movdqu	24(ap,n,8), %xmm1
	movdqu	32(ap,n,8), %xmm0
	psllq	%xmm4, %xmm0
	psrlq	%xmm5, %xmm1
	por	%xmm1, %xmm0
	movdqa	%xmm0, 32(rp,n,8)
L(i56):
	movdqu	8(ap,n,8), %xmm1
	movdqu	16(ap,n,8), %xmm0
	psllq	%xmm4, %xmm0
	psrlq	%xmm5, %xmm1
	por	%xmm1, %xmm0
	movdqa	%xmm0, 16(rp,n,8)
L(i34):
	movdqu	-8(ap,n,8), %xmm1
	movdqu	(ap,n,8), %xmm0
	psllq	%xmm4, %xmm0
	psrlq	%xmm5, %xmm1
	por	%xmm1, %xmm0
	movdqa	%xmm0, (rp,n,8)
	sub	$8, n
	jg	L(top)

L(end):	test	$1, R8(n)	C one low limb left (n odd) or two (n even)?
	jnz	L(end8)

C Two final limbs, written with one aligned 16-byte store
	movdqu	(ap), %xmm1
	pxor	%xmm0, %xmm0
	punpcklqdq  %xmm1, %xmm0
	psllq	%xmm4, %xmm1
	psrlq	%xmm5, %xmm0
	por	%xmm1, %xmm0
	movdqa	%xmm0, (rp)
	FUNC_EXIT()
	ret

C Basecase, n <= 3
	ALIGN(16)
L(bc):	dec	R32(n)
	jz	L(end8)

	movq	(ap,n,8), %xmm1
	movq	-8(ap,n,8), %xmm0
	psllq	%xmm4, %xmm1
	psrlq	%xmm5, %xmm0
	por	%xmm1, %xmm0
	movq	%xmm0, (rp,n,8)
	sub	$2, R32(n)
	jl	L(end8)
	movq	8(ap), %xmm1
	movq	(ap), %xmm0
	psllq	%xmm4, %xmm1
	psrlq	%xmm5, %xmm0
	por	%xmm1, %xmm0
	movq	%xmm0, 8(rp)

L(end8):movq	(ap), %xmm0	C bottom limb: rp[0] = ap[0] << cnt
	psllq	%xmm4, %xmm0
	movq	%xmm0, (rp)
	FUNC_EXIT()
	ret
EPILOGUE()