dnl  AMD64 mpn_lshiftc optimised for CPUs with fast SSE including fast movdqu.

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  Copyright 2010-2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C	     cycles/limb     cycles/limb     cycles/limb    good
C              aligned	      unaligned	      best seen	   for cpu?
C AMD K8,K9	 3		 3		 ?	  no, use shl/shr
C AMD K10	 1.8-2.0	 1.8-2.0	 ?	  yes
C AMD bd1	 1.9		 1.9		 ?	  yes
C AMD bobcat	 3.67		 3.67			  yes, bad for n < 20
C Intel P4	 4.75		 4.75		 ?	  no, slow movdqu
C Intel core2	 2.27		 2.27		 ?	  no, use shld/shrd
C Intel NHM	 2.15		 2.15		 ?	  no, use shld/shrd
C Intel SBR	 1.45		 1.45		 ?	  yes, bad for n = 4-6
C Intel atom	12.9		12.9		 ?	  no
C VIA nano	 6.18		 6.44		 ?	  no, slow movdqu

C We try to do as many aligned 16-byte operations as possible.  The top-most
C and bottom-most writes might need 8-byte operations.
C
C This variant relies on fast load movdqu, and uses it even for aligned
C operands, in order to avoid the need for two separate loops.
C
C TODO
C  * Could 2-limb wind-down code be simplified?
C  * Improve basecase code, using shld/shrd for SBR, discrete integer shifts
C    for other affected CPUs.

C INPUT PARAMETERS
C   rp   destination limb pointer
C   ap   source limb pointer
C   n    number of 8-byte limbs; the code indexes ap/rp as (base,n,8)
C   cnt  left shift count in bits
C        NOTE(review): the shr by -cnt below only yields a shift by 64-cnt
C        for 1 <= cnt <= 63; confirm that range against the caller contract.
define(`rp',	`%rdi')
define(`ap',	`%rsi')
define(`n',	`%rdx')
define(`cnt',	`%rcx')

C mpn_lshiftc
C
C Every result limb is the one-complement of the left-shifted source:
C   rp[i] = ~((ap[i] << cnt) | (ap[i-1] >> (64-cnt)))	for i > 0
C   rp[0] = ~(ap[0] << cnt)
C Limbs are processed from the most significant end downwards.
C
C Register roles after the setup code:
C   %xmm4  cnt		(psllq count)
C   %xmm5  64-cnt	(psrlq count)
C   %xmm3  all ones	(pxor mask performing the complement)
C   %rax   ap[n-1] >> (64-cnt), i.e. the bits shifted out at the top; it is
C	   not written again and is what remains in %rax at both ret sites
C   %rcx   scratch once cnt has been copied to %xmm4
C   %r8    scratch for the loop-entry dispatch
C
C FUNC_ENTRY/FUNC_EXIT are macros defined outside this file; presumably they
C adapt the argument registers for non-SysV ABIs -- confirm in the project
C macro definitions.

ASM_START()
	TEXT
	ALIGN(64)
PROLOGUE(mpn_lshiftc)
	FUNC_ENTRY(4)
	movd	R32(%rcx), %xmm4		C xmm4 = cnt
	mov	$64, R32(%rax)
	sub	R32(%rcx), R32(%rax)
	movd	R32(%rax), %xmm5		C xmm5 = 64-cnt

	neg	R32(%rcx)
	mov	-8(ap,n,8), %rax		C top source limb
	shr	R8(%rcx), %rax			C count is (-cnt) mod 64 in %cl

	pcmpeqb	%xmm3, %xmm3		C set to 111...111

	cmp	$3, n
	jle	L(bc)				C n <= 3 (signed): scalar basecase

C Only bit 3 of the address rp+8n is examined, so the 32-bit lea result is
C sufficient.
	lea	(rp,n,8), R32(%rcx)
	test	$8, R8(%rcx)
	jz	L(rp_aligned)

C Do one initial limb in order to make rp aligned
	movq	-8(ap,n,8), %xmm0
	movq	-16(ap,n,8), %xmm1
	psllq	%xmm4, %xmm0
	psrlq	%xmm5, %xmm1
	por	%xmm1, %xmm0
	pxor	%xmm3, %xmm0
	movq	%xmm0, -8(rp,n,8)
	dec	n

C Select the entry point into the 4-way unrolled loop from (n+1) & 6 and
C bias n to match the displacements used at that entry point.
L(rp_aligned):
	lea	1(n), R32(%r8)

	and	$6, R32(%r8)
	jz	L(ba0)				C (n+1) & 6 == 0
	cmp	$4, R32(%r8)
	jz	L(ba4)				C (n+1) & 6 == 4
	jc	L(ba2)				C below 4, i.e. (n+1) & 6 == 2
L(ba6):	add	$-4, n				C remaining case: (n+1) & 6 == 6
	jmp	L(i56)
L(ba0):	add	$-6, n
	jmp	L(i70)
L(ba4):	add	$-2, n
	jmp	L(i34)
L(ba2):	add	$-8, n
	jle	L(end)				C nothing left for the main loop

C Main loop, 8 limbs per iteration in four 2-limb steps.  In each step xmm0
C holds the limb pair being produced and xmm1 the pair one limb lower (loaded
C 8 bytes below), supplying the bits shifted in from the lower neighbours.
C Loads use movdqu; stores use movdqa, which requires a 16-byte aligned
C destination address.
	ALIGN(16)
L(top):	movdqu	40(ap,n,8), %xmm1
	movdqu	48(ap,n,8), %xmm0
	psllq	%xmm4, %xmm0
	psrlq	%xmm5, %xmm1
	por	%xmm1, %xmm0
	pxor	%xmm3, %xmm0
	movdqa	%xmm0, 48(rp,n,8)
L(i70):
	movdqu	24(ap,n,8), %xmm1
	movdqu	32(ap,n,8), %xmm0
	psllq	%xmm4, %xmm0
	psrlq	%xmm5, %xmm1
	por	%xmm1, %xmm0
	pxor	%xmm3, %xmm0
	movdqa	%xmm0, 32(rp,n,8)
L(i56):
	movdqu	8(ap,n,8), %xmm1
	movdqu	16(ap,n,8), %xmm0
	psllq	%xmm4, %xmm0
	psrlq	%xmm5, %xmm1
	por	%xmm1, %xmm0
	pxor	%xmm3, %xmm0
	movdqa	%xmm0, 16(rp,n,8)
L(i34):
	movdqu	-8(ap,n,8), %xmm1
	movdqu	(ap,n,8), %xmm0
	psllq	%xmm4, %xmm0
	psrlq	%xmm5, %xmm1
	por	%xmm1, %xmm0
	pxor	%xmm3, %xmm0
	movdqa	%xmm0, (rp,n,8)
	sub	$8, n
	jg	L(top)

C Wind-down.  If the low bit of n is set, only limb 0 is handled (at end8).
C Otherwise limbs 1 and 0 are produced with one 16-byte store.
L(end):	test	$1, R8(n)
	jnz	L(end8)

	movdqu	(ap), %xmm1			C xmm1 = ap[1]:ap[0]
	pxor	%xmm0, %xmm0
	punpcklqdq  %xmm1, %xmm0		C xmm0 = ap[0]:0, no limb below 0
	psllq	%xmm4, %xmm1
	psrlq	%xmm5, %xmm0
	por	%xmm1, %xmm0
	pxor	%xmm3, %xmm0
	movdqa	%xmm0, (rp)
	FUNC_EXIT()
	ret

C Basecase
C Reached when n <= 3 (signed compare at entry).  One limb at a time using
C 8-byte movq accesses; only the low 32 bits of n are updated here.
	ALIGN(16)
L(bc):	dec	R32(n)
	jz	L(end8)				C n was 1: only limb 0

	movq	(ap,n,8), %xmm1			C top limb
	movq	-8(ap,n,8), %xmm0		C limb below it
	psllq	%xmm4, %xmm1
	psrlq	%xmm5, %xmm0
	por	%xmm1, %xmm0
	pxor	%xmm3, %xmm0
	movq	%xmm0, (rp,n,8)
	sub	$2, R32(n)
	jl	L(end8)				C n was 2: limb 0 is all that is left
	movq	8(ap), %xmm1			C n was 3: produce limb 1
	movq	(ap), %xmm0
	psllq	%xmm4, %xmm1
	psrlq	%xmm5, %xmm0
	por	%xmm1, %xmm0
	pxor	%xmm3, %xmm0
	movq	%xmm0, 8(rp)

C Lowest limb: psllq shifts in zeros, and the pxor then complements them.
L(end8):movq	(ap), %xmm0
	psllq	%xmm4, %xmm0
	pxor	%xmm3, %xmm0
	movq	%xmm0, (rp)
	FUNC_EXIT()
	ret
EPILOGUE()