dnl  X86-32 and X86-64 mpn_popcount using SSE2.

dnl  Copyright 2006, 2007 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.


include(`../config.m4')


C 32-bit                         popcount     hamdist
C                              cycles/limb  cycles/limb
C P5:                              -
C P6 model 0-8,10-12               -
C P6 model 9  (Banias)             ?
C P6 model 13 (Dothan)             4
C P4 model 0  (Willamette)         ?
C P4 model 1  (?)                  ?
C P4 model 2  (Northwood)          3.9
C P4 model 3  (Prescott)           ?
C P4 model 4  (Nocona)             ?
C K6:                              -
C K7:                              -
C K8:                              ?

C 64-bit                         popcount     hamdist
C                              cycles/limb  cycles/limb
C P4 model 4 (Nocona):             8
C K8:                              7.5
C K10:                             3.5
C P6 core2:                        3.68
C P6 corei7:                       3.15

C TODO
C  * Make an mpn_hamdist based on this.  Alignment could either be handled by
C    using movdqu for one operand and movdqa for the other, or by painfully
C    shifting as we go.  Unfortunately, there seems to be no usable shift
C    instruction, except for one that takes an immediate count.
C  * It would probably be possible to cut a few cycles/limb using software
C    pipelining.
C  * There are 35 decode slots unused by the SSE2 instructions.  Loop control
C    needs just 2 or 3 slots, leaving around 32 slots.  This allows a parallel
C    integer-based popcount.  Such a combined loop would handle 6 limbs in
C    about 30 cycles on K8.
C  * We could save a byte or two by using 32-bit operations on areg.
C  * Check if using movdqa to a temp and then a register-based pand is faster.
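
C The loops below use the standard SWAR bit-counting reduction.  As an
C illustrative sketch only (plain C, not part of this file; x stands for
C one 64-bit lane and is not a symbol used here), the per-lane steps
C correspond to:
C
C	x -= (x >> 1) & 0x5555555555555555;   /* 2-bit fields = pair counts  */
C	x  = (x & 0x3333333333333333) + ((x >> 2) & 0x3333333333333333);
C	x  = (x & 0x0f0f0f0f0f0f0f0f) + ((x >> 4) & 0x0f0f0f0f0f0f0f0f);
C
C after which psadbw against an all-zero register sums the byte counts of
C each 64-bit half, and the result is accumulated into %xmm3.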

ifelse(GMP_LIMB_BITS,`32',
`	define(`up',  `%edx')
	define(`n',   `%ecx')
	define(`areg',`%eax')
	define(`breg',`%ebx')
	define(`zero',`%xmm4')
	define(`LIMB32',`	$1')
	define(`LIMB64',`dnl')
',`
	define(`up',  `%rdi')
	define(`n',   `%rsi')
	define(`areg',`%rax')
	define(`breg',`%rdx')
	define(`zero',`%xmm8')
	define(`LIMB32',`dnl')
	define(`LIMB64',`	$1')
')

define(`mm01010101',`%xmm6')
define(`mm00110011',`%xmm7')
define(`mm00001111',`%xmm2')

define(`GMP_LIMB_BYTES', eval(GMP_LIMB_BITS/8))
define(`LIMBS_PER_XMM',  eval(16/GMP_LIMB_BYTES))
define(`LIMBS_PER_2XMM', eval(32/GMP_LIMB_BYTES))

undefine(`psadbw')			C override inherited m4 version

ASM_START()

C Make cnsts global to work around Apple relocation bug.
ifdef(`DARWIN',`
	define(`cnsts', MPN(popccnsts))
	GLOBL	cnsts')

	TEXT
	ALIGN(32)
PROLOGUE(mpn_popcount)

LIMB32(`mov	4(%esp), up	')
LIMB32(`mov	8(%esp), n	')
LIMB32(`push	%ebx		')

	pxor	%xmm3, %xmm3		C zero grand total count
LIMB64(`pxor	zero, zero	')
ifdef(`PIC',`
	LEA(	cnsts, breg)
',`
LIMB32(`mov	$cnsts, breg	')
LIMB64(`movabs	$cnsts, breg	')
')

	movdqa	-48(breg), mm01010101
	movdqa	-32(breg), mm00110011
	movdqa	-16(breg), mm00001111

	mov	up, areg
	and	$-16, up		C round `up' down to 128-bit boundary
	and	$12, areg		C 32:areg = 0, 4, 8, 12
					C 64:areg = 0, 8
	movdqa	(up), %xmm0
	pand	64(breg,areg,4), %xmm0
	shr	$m4_log2(GMP_LIMB_BYTES), %eax
	add	areg, n			C compensate n for rounded down `up'

	pxor	%xmm4, %xmm4
	sub	$LIMBS_PER_XMM, n
	jbe	L(sum)

	sub	$LIMBS_PER_XMM, n
	ja	L(ent)
	jmp	L(lsum)

	ALIGN(16)
L(top):	movdqa	(up), %xmm0
L(ent):	movdqa	16(up), %xmm4

	movdqa	%xmm0, %xmm1
	movdqa	%xmm4, %xmm5
	psrld	$1, %xmm0
	psrld	$1, %xmm4
	pand	mm01010101, %xmm0
	pand	mm01010101, %xmm4
	psubd	%xmm0, %xmm1
	psubd	%xmm4, %xmm5

	movdqa	%xmm1, %xmm0
	movdqa	%xmm5, %xmm4
	psrlq	$2, %xmm1
	psrlq	$2, %xmm5
	pand	mm00110011, %xmm0
	pand	mm00110011, %xmm4
	pand	mm00110011, %xmm1
	pand	mm00110011, %xmm5
	paddq	%xmm0, %xmm1
	paddq	%xmm4, %xmm5

LIMB32(`pxor	zero, zero	')

	add	$32, up
	sub	$LIMBS_PER_2XMM, n

	paddq	%xmm5, %xmm1
	movdqa	%xmm1, %xmm0
	psrlq	$4, %xmm1
	pand	mm00001111, %xmm0
	pand	mm00001111, %xmm1
	paddq	%xmm0, %xmm1

	psadbw	zero, %xmm1
	paddq	%xmm1, %xmm3		C add to grand total

	jnc	L(top)
L(end):
	add	$LIMBS_PER_2XMM, n
	jz	L(rt)
	movdqa	(up), %xmm0
	pxor	%xmm4, %xmm4
	sub	$LIMBS_PER_XMM, n
	jbe	L(sum)
L(lsum):
	movdqa	%xmm0, %xmm4
	movdqa	16(up), %xmm0
L(sum):
	shl	$m4_log2(GMP_LIMB_BYTES), n
	and	$12, n
	pand	(breg,n,4), %xmm0

	movdqa	%xmm0, %xmm1
	movdqa	%xmm4, %xmm5
	psrld	$1, %xmm0
	psrld	$1, %xmm4
	pand	mm01010101, %xmm0
	pand	mm01010101, %xmm4
	psubd	%xmm0, %xmm1
	psubd	%xmm4, %xmm5

	movdqa	%xmm1, %xmm0
	movdqa	%xmm5, %xmm4
	psrlq	$2, %xmm1
	psrlq	$2, %xmm5
	pand	mm00110011, %xmm0
	pand	mm00110011, %xmm4
	pand	mm00110011, %xmm1
	pand	mm00110011, %xmm5
	paddq	%xmm0, %xmm1
	paddq	%xmm4, %xmm5

LIMB32(`pxor	zero, zero	')

	paddq	%xmm5, %xmm1
	movdqa	%xmm1, %xmm0
	psrlq	$4, %xmm1
	pand	mm00001111, %xmm0
	pand	mm00001111, %xmm1
	paddq	%xmm0, %xmm1

	psadbw	zero, %xmm1
	paddq	%xmm1, %xmm3		C add to grand total


C Add the two 64-bit halves of the grand total counter
L(rt):	movdqa	%xmm3, %xmm0
	psrldq	$8, %xmm3
	paddq	%xmm3, %xmm0
	movd	%xmm0, areg		C movq avoided due to gas bug

LIMB32(`pop	%ebx		')
	ret

EPILOGUE()
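
C The constant block below is addressed relative to the cnsts label: the
C three bit-twiddling masks sit at negative offsets (-48, -32, -16), while
C the edge masks follow at non-negative offsets.  The initial pand above
C (offset 64 plus 4*areg) appears to select a "low end" mask that clears
C any limbs sitting below the first operand limb after `up' was rounded
C down to a 16-byte boundary, and the tail pand (offset 4*n) a "high end"
C mask that clears limbs past the last operand limb.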
DEF_OBJECT(dummy,16)
C Three magic constants used for masking out bits
	.byte	0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55
	.byte	0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55

	.byte	0x33,0x33,0x33,0x33,0x33,0x33,0x33,0x33
	.byte	0x33,0x33,0x33,0x33,0x33,0x33,0x33,0x33

	.byte	0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f
	.byte	0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f
cnsts:
C Masks for high end of number
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff

	.byte	0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00

	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00

	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
	.byte	0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
C Masks for low end of number
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff

	.byte	0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff

	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff

	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
	.byte	0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff
END_OBJECT(dummy)