dnl  X86-32 and X86-64 mpn_popcount using SSE2.

dnl  Copyright 2006, 2007, 2011, 2015 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.


include(`../config.m4')


C 32-bit			popcount	hamdist
C				cycles/limb	cycles/limb
C P5				-
C P6 model 0-8,10-12		-
C P6 model 9  (Banias)		?
C P6 model 13 (Dothan)		4
C P4 model 0  (Willamette)	?
C P4 model 1  (?)		?
C P4 model 2  (Northwood)	3.9
C P4 model 3  (Prescott)	?
C P4 model 4  (Nocona)		?
C AMD K6			-
C AMD K7			-
C AMD K8			?

C 64-bit			popcount	hamdist
C				cycles/limb	cycles/limb
C P4 model 4 (Nocona)		8
C AMD K8,K9			7.5
C AMD K10			3.5
C Intel core2			3.68
C Intel corei			3.15
C Intel atom			10.8
C VIA nano			6.5

C TODO
C  * Make an mpn_hamdist based on this.  Alignment could either be handled by
C    using movdqu for one operand and movdqa for the other, or by painfully
C    shifting as we go.  Unfortunately, there seems to be no usable shift
C    instruction, except for one that takes an immediate count.
C  * It would probably be possible to cut a few cycles/limb using software
C    pipelining.
C  * There are 35 decode slots unused by the SSE2 instructions.  Loop control
C    needs just 2 or 3 slots, leaving around 32 slots.  This allows a parallel
C    integer-based popcount.  Such a combined loop would handle 6 limbs in
C    about 30 cycles on K8.
C  * We could save a byte or two by using 32-bit operations on areg.
C  * Check if using movdqa to a temp register and then register-based pand
C    is faster.

ifelse(GMP_LIMB_BITS,`32',
`	define(`up',  `%edx')
	define(`n',   `%ecx')
	define(`areg',`%eax')
	define(`breg',`%ebx')
	define(`zero',`%xmm4')
	define(`LIMB32',`	$1')
	define(`LIMB64',`dnl')
',`
	define(`up',  `%rdi')
	define(`n',   `%rsi')
	define(`areg',`%rax')
	define(`breg',`%rdx')
	define(`zero',`%xmm8')
	define(`LIMB32',`dnl')
	define(`LIMB64',`	$1')
')

define(`mm01010101',`%xmm6')
define(`mm00110011',`%xmm7')
define(`mm00001111',`%xmm2')

define(`GMP_LIMB_BYTES', eval(GMP_LIMB_BITS/8))
define(`LIMBS_PER_XMM',  eval(16/GMP_LIMB_BYTES))
define(`LIMBS_PER_2XMM', eval(32/GMP_LIMB_BYTES))

undefine(`psadbw')			C override inherited m4 version

C This file is shared between 32-bit and 64-bit builds.  Only the former has
C LEAL; default LEAL as an alias of LEA.
ifdef(`LEAL',,`define(`LEAL', `LEA($1,$2)')')

ASM_START()
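
C The loop below performs the usual mask-and-add popcount reduction, 16
C bytes at a time: bits are summed within 2-bit, then 4-bit, then 8-bit
C fields using the 0x55.., 0x33.. and 0x0f.. masks stored just before the
C cnsts label, and psadbw against an all-zero register finally adds the
C sixteen byte counts into two 64-bit halves which accumulate in %xmm3.
C A rough scalar C sketch of the same per-word reduction (illustrative
C only, not part of this file; the multiply stands in for psadbw's byte
C summation, and the name scalar_popcount is made up for this note):
C
C	#include <stdint.h>
C	static unsigned scalar_popcount (uint64_t x)
C	{
C	  x -= (x >> 1) & 0x5555555555555555;		/* 2-bit field sums */
C	  x = (x & 0x3333333333333333) + ((x >> 2) & 0x3333333333333333);
C	  x = (x + (x >> 4)) & 0x0f0f0f0f0f0f0f0f;	/* per-byte sums */
C	  return (x * 0x0101010101010101) >> 56;	/* add the bytes */
C	}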

C Make cnsts global to work around Apple relocation bug.
ifdef(`DARWIN',`
	define(`cnsts', MPN(popccnsts))
	GLOBL	cnsts')

	TEXT
	ALIGN(32)
PROLOGUE(mpn_popcount)

LIMB32(`mov	4(%esp), up	')
LIMB32(`mov	8(%esp), n	')
LIMB32(`push	%ebx		')

	pxor	%xmm3, %xmm3		C zero grand total count
LIMB64(`pxor	zero, zero	')
ifdef(`PIC',`
	LEAL(	cnsts, breg)
',`
LIMB32(`mov	$cnsts, breg	')
LIMB64(`movabs	$cnsts, breg	')
')

	movdqa	-48(breg), mm01010101
	movdqa	-32(breg), mm00110011
	movdqa	-16(breg), mm00001111

	mov	up, areg
	and	$-16, up		C round `up' down to 128-bit boundary
	and	$12, areg		C 32:areg = 0, 4, 8, 12
					C 64:areg = 0, 8
	movdqa	(up), %xmm0
	pand	64(breg,areg,4), %xmm0
	shr	$m4_log2(GMP_LIMB_BYTES), %eax
	add	areg, n			C compensate n for rounded-down `up'

	pxor	%xmm4, %xmm4
	sub	$LIMBS_PER_XMM, n
	jbe	L(sum)

	sub	$LIMBS_PER_XMM, n
	ja	L(ent)
	jmp	L(lsum)

	ALIGN(16)
L(top):	movdqa	(up), %xmm0
L(ent):	movdqa	16(up), %xmm4

	movdqa	%xmm0, %xmm1
	movdqa	%xmm4, %xmm5
	psrld	$1, %xmm0
	psrld	$1, %xmm4
	pand	mm01010101, %xmm0
	pand	mm01010101, %xmm4
	psubd	%xmm0, %xmm1
	psubd	%xmm4, %xmm5

	movdqa	%xmm1, %xmm0
	movdqa	%xmm5, %xmm4
	psrlq	$2, %xmm1
	psrlq	$2, %xmm5
	pand	mm00110011, %xmm0
	pand	mm00110011, %xmm4
	pand	mm00110011, %xmm1
	pand	mm00110011, %xmm5
	paddq	%xmm0, %xmm1
	paddq	%xmm4, %xmm5

LIMB32(`pxor	zero, zero	')

	add	$32, up
	sub	$LIMBS_PER_2XMM, n

	paddq	%xmm5, %xmm1
	movdqa	%xmm1, %xmm0
	psrlq	$4, %xmm1
	pand	mm00001111, %xmm0
	pand	mm00001111, %xmm1
	paddq	%xmm0, %xmm1

	psadbw	zero, %xmm1
	paddq	%xmm1, %xmm3		C add to grand total

	jnc	L(top)
L(end):
	add	$LIMBS_PER_2XMM, n
	jz	L(rt)
	movdqa	(up), %xmm0
	pxor	%xmm4, %xmm4
	sub	$LIMBS_PER_XMM, n
	jbe	L(sum)
L(lsum):
	movdqa	%xmm0, %xmm4
	movdqa	16(up), %xmm0
L(sum):
	shl	$m4_log2(GMP_LIMB_BYTES), n
	and	$12, n
	pand	(breg,n,4), %xmm0

	movdqa	%xmm0, %xmm1
	movdqa	%xmm4, %xmm5
	psrld	$1, %xmm0
	psrld	$1, %xmm4
	pand	mm01010101, %xmm0
	pand	mm01010101, %xmm4
	psubd	%xmm0, %xmm1
	psubd	%xmm4, %xmm5

	movdqa	%xmm1, %xmm0
	movdqa	%xmm5, %xmm4
	psrlq	$2, %xmm1
	psrlq	$2, %xmm5
	pand	mm00110011, %xmm0
	pand	mm00110011, %xmm4
	pand	mm00110011, %xmm1
	pand	mm00110011, %xmm5
	paddq	%xmm0, %xmm1
	paddq	%xmm4, %xmm5

LIMB32(`pxor	zero, zero	')

	paddq	%xmm5, %xmm1
	movdqa	%xmm1, %xmm0
	psrlq	$4, %xmm1
	pand	mm00001111, %xmm0
	pand	mm00001111, %xmm1
	paddq	%xmm0, %xmm1

	psadbw	zero, %xmm1
	paddq	%xmm1, %xmm3		C add to grand total


C Add the two 64-bit halves of the grand total counter
L(rt):	movdqa	%xmm3, %xmm0
	psrldq	$8, %xmm3
	paddq	%xmm3, %xmm0
	movd	%xmm0, areg		C movq avoided due to gas bug

LIMB32(`pop	%ebx		')
	ret

EPILOGUE()
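
C Layout of the constant block below, relative to the `cnsts' label: the
C three bit-twiddling masks sit at cnsts-48 (0x55..), cnsts-32 (0x33..)
C and cnsts-16 (0x0f..); the high-end masks at cnsts+0,+16,+32,+48 drop
C bytes past the operand's last limb, and the low-end masks at
C cnsts+64,+80,+96,+112 drop bytes below `up' after it was rounded down
C to a 16-byte boundary.  A hedged C sketch of the two scaled-index mask
C selections done by the pand instructions above (the names low_mask,
C high_mask and valid_bytes are illustrative, not part of GMP):
C
C	/* low-end mask for the first, possibly partial, xmm word;
C	   `up' here is the incoming pointer, before rounding down */
C	low_mask  = cnsts + 64 + 4 * ((uintptr_t) up & 12);
C	/* high-end mask for the last xmm word, keeping its first
C	   valid_bytes bytes (valid_bytes is 4, 8, 12 or 16) */
C	high_mask = cnsts + 4 * (valid_bytes & 12);
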
DEF_OBJECT(dummy,16)
C Three magic constants used for masking out bits
	.byte	0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55
	.byte	0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55

	.byte	0x33,0x33,0x33,0x33,0x33,0x33,0x33,0x33
	.byte	0x33,0x33,0x33,0x33,0x33,0x33,0x33,0x33

	.byte	0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f
	.byte	0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f
cnsts:
C Masks for high end of number
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff

	.byte	0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00

	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00

	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
	.byte	0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
C Masks for low end of number
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff

	.byte	0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff

	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff

	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
	.byte	0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff
END_OBJECT(dummy)
ASM_END()
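
dnl  For reference, callers reach this routine through the documented GMP
dnl  entry point mp_bitcnt_t mpn_popcount (const mp_limb_t *s1p, mp_size_t n),
dnl  which returns the number of set bits in {s1p, n}.  A minimal
dnl  (illustrative) C use:
dnl
dnl	#include <gmp.h>
dnl	mp_limb_t   v[2] = { 0xff, 1 };
dnl	mp_bitcnt_t c = mpn_popcount (v, 2);	/* c == 9 */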