dnl  AMD64 SSE mpn_sec_tabselect.

dnl  Contributed to the GNU project by Torbjörn Granlund.

dnl  Copyright 2011-2013 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C                cycles/limb   cycles/limb   cycles/limb
C                 ali,evn n    unal,evn n    other cases
C AMD K8,K9         1.65          1.65          1.8
C AMD K10           0.78          0.78          0.85
C AMD bd1           0.80          0.91          1.25
C AMD bobcat        2.15          2.15          2.37
C Intel P4          2.5           2.5           2.95
C Intel core2       1.17          1.25          1.25
C Intel NHM         0.87          0.90          0.90
C Intel SBR         0.63          0.79          0.77
C Intel atom        4.3           4.3           4.3      slower than plain code
C VIA nano          1.4           5.1           3.14     too alignment dependent

C NOTES
C  * We only honour the least significant 32 bits of the `which' and `nents'
C    arguments, which allows efficient code using just SSE2.  Honouring all
C    64 bits would require either the SSE4_1 pcmpeqq or some more involved
C    SSE2 sequence.
C  * We use movd for copying between xmm and plain registers, since old gas
C    rejects movq.  But gas assembles movd as movq when given a 64-bit greg.
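
C In C terms the function does a side-channel-silent table lookup: it reads
C all nents table entries unconditionally and accumulates, through an
C all-zeros or all-ones compare-derived mask, only the entry with index
C `which', so neither branches nor load addresses depend on the secret
C index.  A minimal C sketch of the technique (not GMP's actual generic
C code; a real implementation must also keep the compiler from turning the
C mask back into a branch):
C
C	void
C	mpn_sec_tabselect (mp_limb_t *rp, const mp_limb_t *tab,
C			   mp_size_t n, mp_size_t nents, mp_size_t which)
C	{
C	  mp_size_t i, j;
C	  mp_limb_t mask;
C	  for (j = 0; j < n; j++)
C	    rp[j] = 0;
C	  for (i = 0; i < nents; i++)
C	    {
C	      mask = -(mp_limb_t) (i == which);	/* all ones iff i == which */
C	      for (j = 0; j < n; j++)
C	        rp[j] |= tab[i * n + j] & mask;
C	    }
C	}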

define(`rp',	 `%rdi')
define(`tp',	 `%rsi')
define(`n',	 `%rdx')
define(`nents',	 `%rcx')
define(`which',	 `%r8')

define(`i',	 `%r10')
define(`j',	 `%r9')

C  rax  rbx  rcx    rdx  rdi  rsi  rbp  r8     r9  r10  r11   r12  r13  r14  r15
C            nents  n    rp   tab       which  j   i    temp  *    *    *    *

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_sec_tabselect)
	FUNC_ENTRY(4)
IFDOS(`	mov	56(%rsp), %r8d	')

	movd	which, %xmm8
	pshufd	$0, %xmm8, %xmm8	C 4 `which' copies
	mov	$1, R32(%rax)
	movd	%rax, %xmm9
	pshufd	$0, %xmm9, %xmm9	C 4 copies of 1

	mov	n, j
	add	$-8, j			C j = n - 8
	js	L(outer_end)

C Main loop: produce 8 result limbs (64 bytes) per outer pass, scanning all
C nents entries and accumulating the masked one with pand/por.
L(outer_top):
	mov	nents, i
	mov	tp, %r11		C remember column start
	pxor	%xmm13, %xmm13		C entry index, 4 lanes
	pxor	%xmm4, %xmm4
	pxor	%xmm5, %xmm5
	pxor	%xmm6, %xmm6
	pxor	%xmm7, %xmm7
	ALIGN(16)
L(top):	movdqa	%xmm8, %xmm0
	pcmpeqd	%xmm13, %xmm0		C mask = -(index == which), all lanes
	paddd	%xmm9, %xmm13		C index++
	movdqu	0(tp), %xmm2
	movdqu	16(tp), %xmm3
	pand	%xmm0, %xmm2
	pand	%xmm0, %xmm3
	por	%xmm2, %xmm4
	por	%xmm3, %xmm5
	movdqu	32(tp), %xmm2
	movdqu	48(tp), %xmm3
	pand	%xmm0, %xmm2
	pand	%xmm0, %xmm3
	por	%xmm2, %xmm6
	por	%xmm3, %xmm7
	lea	(tp,n,8), tp		C step to next table entry
	add	$-1, i
	jne	L(top)

	movdqu	%xmm4, 0(rp)
	movdqu	%xmm5, 16(rp)
	movdqu	%xmm6, 32(rp)
	movdqu	%xmm7, 48(rp)

	lea	64(%r11), tp		C next 8-limb column
	lea	64(rp), rp
	add	$-8, j
	jns	L(outer_top)
L(outer_end):

C Handle the remaining n mod 8 limbs with 4-, 2- and 1-limb passes.
	test	$4, R8(n)
	je	L(b0xx)
L(b1xx):mov	nents, i
	mov	tp, %r11
	pxor	%xmm13, %xmm13
	pxor	%xmm4, %xmm4
	pxor	%xmm5, %xmm5
	ALIGN(16)
L(tp4):	movdqa	%xmm8, %xmm0
	pcmpeqd	%xmm13, %xmm0
	paddd	%xmm9, %xmm13
	movdqu	0(tp), %xmm2
	movdqu	16(tp), %xmm3
	pand	%xmm0, %xmm2
	pand	%xmm0, %xmm3
	por	%xmm2, %xmm4
	por	%xmm3, %xmm5
	lea	(tp,n,8), tp
	add	$-1, i
	jne	L(tp4)
	movdqu	%xmm4, 0(rp)
	movdqu	%xmm5, 16(rp)
	lea	32(%r11), tp
	lea	32(rp), rp

L(b0xx):test	$2, R8(n)
	je	L(b00x)
L(b01x):mov	nents, i
	mov	tp, %r11
	pxor	%xmm13, %xmm13
	pxor	%xmm4, %xmm4
	ALIGN(16)
L(tp2):	movdqa	%xmm8, %xmm0
	pcmpeqd	%xmm13, %xmm0
	paddd	%xmm9, %xmm13
	movdqu	0(tp), %xmm2
	pand	%xmm0, %xmm2
	por	%xmm2, %xmm4
	lea	(tp,n,8), tp
	add	$-1, i
	jne	L(tp2)
	movdqu	%xmm4, 0(rp)
	lea	16(%r11), tp
	lea	16(rp), rp

L(b00x):test	$1, R8(n)
	je	L(b000)
L(b001):mov	nents, i
	mov	tp, %r11
	pxor	%xmm13, %xmm13
	pxor	%xmm4, %xmm4
	ALIGN(16)
L(tp1):	movdqa	%xmm8, %xmm0
	pcmpeqd	%xmm13, %xmm0
	paddd	%xmm9, %xmm13
	movq	0(tp), %xmm2		C single limb
	pand	%xmm0, %xmm2
	por	%xmm2, %xmm4
	lea	(tp,n,8), tp
	add	$-1, i
	jne	L(tp1)
	movq	%xmm4, 0(rp)

L(b000):FUNC_EXIT()
	ret
EPILOGUE()