1dnl AMD64 SSE mpn_sec_tabselect. 2 3dnl Contributed to the GNU project by Torbjörn Granlund. 4 5dnl Copyright 2011-2013 Free Software Foundation, Inc. 6 7dnl This file is part of the GNU MP Library. 8dnl 9dnl The GNU MP Library is free software; you can redistribute it and/or modify 10dnl it under the terms of either: 11dnl 12dnl * the GNU Lesser General Public License as published by the Free 13dnl Software Foundation; either version 3 of the License, or (at your 14dnl option) any later version. 15dnl 16dnl or 17dnl 18dnl * the GNU General Public License as published by the Free Software 19dnl Foundation; either version 2 of the License, or (at your option) any 20dnl later version. 21dnl 22dnl or both in parallel, as here. 23dnl 24dnl The GNU MP Library is distributed in the hope that it will be useful, but 25dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 26dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 27dnl for more details. 28dnl 29dnl You should have received copies of the GNU General Public License and the 30dnl GNU Lesser General Public License along with the GNU MP Library. If not, 31dnl see https://www.gnu.org/licenses/. 32 33include(`../config.m4') 34 35 36C cycles/limb cycles/limb cycles/limb 37C ali,evn n unal,evn n other cases 38C AMD K8,K9 1.65 1.65 1.8 39C AMD K10 0.78 0.78 0.85 40C AMD bd1 0.80 0.91 1.25 41C AMD bobcat 2.15 2.15 2.37 42C Intel P4 2.5 2.5 2.95 43C Intel core2 1.17 1.25 1.25 44C Intel NHM 0.87 0.90 0.90 45C Intel SBR 0.63 0.79 0.77 46C Intel atom 4.3 4.3 4.3 slower than plain code 47C VIA nano 1.4 5.1 3.14 too alignment dependent 48 49C NOTES 50C * We only honour the least significant 32 bits of the `which' and `nents' 51C arguments to allow efficient code using just SSE2. We would need to 52C either use the SSE4_1 pcmpeqq, or find some other SSE2 sequence. 53C * We use movd for copying between xmm and plain registers, since old gas 54C rejects movq. 
C   (cont.)  But gas assembles movd as movq when given a 64-bit greg.

C mpn_sec_tabselect (rp, tab, n, nents, which)
C
C Copy the n-limb entry number `which' from the table tab (nents entries
C of n limbs each) to rp.  Every entry is read, masked with the result of
C an equality compare against `which', and OR-ed into the accumulators,
C so both the instruction trace and the memory reference pattern are
C independent of `which' (constant-time selection).

define(`rp',	`%rdi')		C result area, n limbs
define(`tp',	`%rsi')		C table; walks down the current limb column
define(`n',	`%rdx')		C limbs per table entry
define(`nents',	`%rcx')		C number of table entries
define(`which',	`%r8')		C wanted entry (only low 32 bits honoured, see NOTES)

define(`i',	`%r10')		C inner count: table entries left to scan
define(`j',	`%r9')		C outer count: 8-limb column groups left

C     rax rbx rcx   rdx rdi rsi rbp r8    r9 r10 r11  r12 r13 r14 r15
C         nents     n   rp  tab     which j  i   temp  *   *   *   *

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_sec_tabselect)
	FUNC_ENTRY(4)
IFDOS(`	mov	56(%rsp), %r8d	')	C Win64: 5th argument is on the stack

IFDOS(`	add	$-88, %rsp	')	C Win64: xmm6-xmm9 are callee-saved,
IFDOS(`	movdqu	%xmm6, (%rsp)	')	C so spill them before use
IFDOS(`	movdqu	%xmm7, 16(%rsp)	')
IFDOS(`	movdqu	%xmm8, 32(%rsp)	')
IFDOS(`	movdqu	%xmm9, 48(%rsp)	')

	movd	which, %xmm8
	pshufd	$0, %xmm8, %xmm8	C 4 `which' copies
	mov	$1, R32(%rax)
	movd	%rax, %xmm9
	pshufd	$0, %xmm9, %xmm9	C 4 copies of 1 (per-entry index step)

C Main loop: produce the result 8 limbs (64 bytes) per outer iteration.
	mov	n, j
	add	$-8, j
	js	L(outer_end)		C fewer than 8 limbs remain -> tail code

L(outer_top):
	mov	nents, i
	mov	tp, %r11		C save column start; tp walks the table
	pxor	%xmm1, %xmm1		C xmm1 = running entry index (4 dwords)
	pxor	%xmm4, %xmm4		C xmm4..xmm7 accumulate the 8 result limbs
	pxor	%xmm5, %xmm5
	pxor	%xmm6, %xmm6
	pxor	%xmm7, %xmm7
	ALIGN(16)
C Scan all nents entries; the mask is all-ones only for entry `which'.
L(top):	movdqa	%xmm8, %xmm0
	pcmpeqd	%xmm1, %xmm0		C xmm0 = (index == which) ? ~0 : 0
	paddd	%xmm9, %xmm1		C index++
	movdqu	0(tp), %xmm2
	movdqu	16(tp), %xmm3
	pand	%xmm0, %xmm2		C keep data iff this is the wanted entry
	pand	%xmm0, %xmm3
	por	%xmm2, %xmm4
	por	%xmm3, %xmm5
	movdqu	32(tp), %xmm2
	movdqu	48(tp), %xmm3
	pand	%xmm0, %xmm2
	pand	%xmm0, %xmm3
	por	%xmm2, %xmm6
	por	%xmm3, %xmm7
	lea	(tp,n,8), tp		C step to same column of next entry
	add	$-1, i
	jne	L(top)

	movdqu	%xmm4, 0(rp)		C store the 8 selected limbs
	movdqu	%xmm5, 16(rp)
	movdqu	%xmm6, 32(rp)
	movdqu	%xmm7, 48(rp)

	lea	64(%r11), tp		C advance to next 8-limb column of entry 0
	lea	64(rp), rp
	add	$-8, j
	jns	L(outer_top)
L(outer_end):

C Tail: handle n mod 8 remaining limbs with 4-, 2- and 1-limb copies of
C the same scan-and-mask loop, keyed off the low bits of n.
	test	$4, R8(n)
	je	L(b0xx)
L(b1xx):mov	nents, i
	mov	tp, %r11
	pxor	%xmm1, %xmm1
	pxor	%xmm4, %xmm4
	pxor	%xmm5, %xmm5
	ALIGN(16)
L(tp4):	movdqa	%xmm8, %xmm0
	pcmpeqd	%xmm1, %xmm0
	paddd	%xmm9, %xmm1
	movdqu	0(tp), %xmm2
	movdqu	16(tp), %xmm3
	pand	%xmm0, %xmm2
	pand	%xmm0, %xmm3
	por	%xmm2, %xmm4
	por	%xmm3, %xmm5
	lea	(tp,n,8), tp
	add	$-1, i
	jne	L(tp4)
	movdqu	%xmm4, 0(rp)
	movdqu	%xmm5, 16(rp)
	lea	32(%r11), tp
	lea	32(rp), rp

L(b0xx):test	$2, R8(n)		C 2-limb (16-byte) tail chunk?
	je	L(b00x)
L(b01x):mov	nents, i
	mov	tp, %r11
	pxor	%xmm1, %xmm1
	pxor	%xmm4, %xmm4
	ALIGN(16)
L(tp2):	movdqa	%xmm8, %xmm0
	pcmpeqd	%xmm1, %xmm0
	paddd	%xmm9, %xmm1
	movdqu	0(tp), %xmm2
	pand	%xmm0, %xmm2
	por	%xmm2, %xmm4
	lea	(tp,n,8), tp
	add	$-1, i
	jne	L(tp2)
	movdqu	%xmm4, 0(rp)
	lea	16(%r11), tp
	lea	16(rp), rp

L(b00x):test	$1, R8(n)		C final odd limb?
	je	L(b000)
L(b001):mov	nents, i
	mov	tp, %r11
	pxor	%xmm1, %xmm1
	pxor	%xmm4, %xmm4
	ALIGN(16)
L(tp1):	movdqa	%xmm8, %xmm0
	pcmpeqd	%xmm1, %xmm0
	paddd	%xmm9, %xmm1
	movq	0(tp), %xmm2		C single limb: 64-bit load
	pand	%xmm0, %xmm2
	por	%xmm2, %xmm4
	lea	(tp,n,8), tp
	add	$-1, i
	jne	L(tp1)
	movq	%xmm4, 0(rp)

L(b000):
IFDOS(`	movdqu	(%rsp), %xmm6	')	C Win64: restore callee-saved xmm regs
IFDOS(`	movdqu	16(%rsp), %xmm7	')
IFDOS(`	movdqu	32(%rsp), %xmm8	')
IFDOS(`	movdqu	48(%rsp), %xmm9	')
IFDOS(`	add	$88, %rsp	')
	FUNC_EXIT()
	ret
EPILOGUE()