1dnl AMD K6-2 mpn_popcount, mpn_hamdist -- mpn bit population count and 2dnl hamming distance. 3 4dnl Copyright 2000, 2001, 2002 Free Software Foundation, Inc. 5dnl 6dnl This file is part of the GNU MP Library. 7dnl 8dnl The GNU MP Library is free software; you can redistribute it and/or 9dnl modify it under the terms of the GNU Lesser General Public License as 10dnl published by the Free Software Foundation; either version 3 of the 11dnl License, or (at your option) any later version. 12dnl 13dnl The GNU MP Library is distributed in the hope that it will be useful, 14dnl but WITHOUT ANY WARRANTY; without even the implied warranty of 15dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 16dnl Lesser General Public License for more details. 17dnl 18dnl You should have received a copy of the GNU Lesser General Public License 19dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 20 21include(`../config.m4') 22 23 24C popcount hamdist 25C K6-2: 9.0 11.5 cycles/limb 26C K6: 12.5 13.0 27 28 29C unsigned long mpn_popcount (mp_srcptr src, mp_size_t size); 30C unsigned long mpn_hamdist (mp_srcptr src, mp_srcptr src2, mp_size_t size); 31C 32C The code here isn't optimal, but it's already a 2x speedup over the plain 33C integer mpn/generic/popcount.c,hamdist.c. 
C This one source builds either mpn_popcount or mpn_hamdist.  Exactly one of
C OPERATION_popcount or OPERATION_hamdist has to be defined when the file is
C processed, otherwise processing stops with the m4 error below.

ifdef(`OPERATION_popcount',,
`ifdef(`OPERATION_hamdist',,
`m4_error(`Need OPERATION_popcount or OPERATION_hamdist
')m4exit(1)')')

C HAM(text) expands to its argument only in the hamdist build and to nothing
C in the popcount build.

define(HAM,
m4_assert_numargs(1)
`ifdef(`OPERATION_hamdist',`$1')')

C POP(text) is the converse, expanding to its argument only in the popcount
C build.

define(POP,
m4_assert_numargs(1)
`ifdef(`OPERATION_popcount',`$1')')

C Parameter frame offsets and the entry point name for each build.  hamdist
C takes a second source pointer, which moves PARAM_SIZE from 8 to 12.
C NOTE(review): the numbers are presumably byte offsets from the stack
C pointer at entry, confirm against the defframe definition.

HAM(`
defframe(PARAM_SIZE,   12)
defframe(PARAM_SRC2,   8)
defframe(PARAM_SRC,    4)
define(M4_function,mpn_hamdist)
')
POP(`
defframe(PARAM_SIZE,   8)
defframe(PARAM_SRC,    4)
define(M4_function,mpn_popcount)
')

MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist)


C Non-PIC builds keep the four 64-bit mask constants in read-only data, each
C written as two identical dwords, and load them with movq in the prologue.
C PIC builds emit no data here and construct the same values in registers.

ifdef(`PIC',,`
	dnl  non-PIC

	RODATA
	ALIGN(8)

L(rodata_AAAAAAAAAAAAAAAA):
	.long	0xAAAAAAAA
	.long	0xAAAAAAAA

L(rodata_3333333333333333):
	.long	0x33333333
	.long	0x33333333

L(rodata_0F0F0F0F0F0F0F0F):
	.long	0x0F0F0F0F
	.long	0x0F0F0F0F

L(rodata_000000FF000000FF):
	.long	0x000000FF
	.long	0x000000FF
')

	TEXT
	ALIGN(32)

POP(`ifdef(`PIC', `
	C avoid shrl crossing a 32-byte boundary
	nop')')

PROLOGUE(M4_function)
deflit(`FRAME',0)

	movl	PARAM_SIZE, %ecx

C Put the four masks in mm7, mm6, mm5 and mm4.  In the PIC case each 32-bit
C immediate goes through an integer register into the low dword of an mmx
C register, and punpckldq of a register with itself then copies that low
C dword into the high dword as well.  The integer and mmx instructions are
C interleaved in pairs.  In the non-PIC case the masks are loaded directly
C from the RODATA entries defined earlier in this file.

ifdef(`PIC',`
	movl	$0xAAAAAAAA, %eax
	movl	$0x33333333, %edx

	movd	%eax, %mm7
	movd	%edx, %mm6

	movl	$0x0F0F0F0F, %eax
	movl	$0x000000FF, %edx

	punpckldq %mm7, %mm7
	punpckldq %mm6, %mm6

	movd	%eax, %mm5
	movd	%edx, %mm4

	punpckldq %mm5, %mm5
	punpckldq %mm4, %mm4
',`

	movq	L(rodata_AAAAAAAAAAAAAAAA), %mm7
	movq	L(rodata_3333333333333333), %mm6
	movq	L(rodata_0F0F0F0F0F0F0F0F), %mm5
	movq	L(rodata_000000FF000000FF), %mm4
')

C Symbolic names for the mask registers, used throughout the loop body.

define(REG_AAAAAAAAAAAAAAAA, %mm7)
define(REG_3333333333333333, %mm6)
define(REG_0F0F0F0F0F0F0F0F, %mm5)
define(REG_000000FF000000FF, %mm4)


	movl	PARAM_SRC, %eax
HAM(`	movl	PARAM_SRC2, %edx')

	pxor	%mm2, %mm2	C total

	C Halve the size to get a count of qwords in ecx.  The bit shifted out
	C lands in the carry flag, so carry clear means an even size and the
	C loop can be entered directly.
	C NOTE(review): there is no test for size being zero.  With ecx zero
	C the loop instruction at the bottom would wrap around, so callers
	C presumably guarantee size is at least 1, confirm.

	shrl	%ecx
	jnc	L(top)

	C Odd size.  The dword at byte offset 8*ecx is the one left over above
	C the ecx whole qwords.  movd loads it into the low half of mm1 and
	C zeros the high half, so it goes through the loop body as a qword
	C whose upper half contributes nothing.
	C NOTE(review): Zdisp presumably forces a particular encoding of the
	C zero displacement, confirm against its definition.

Zdisp(	movd,	0,(%eax,%ecx,8), %mm1)

HAM(`
Zdisp(	movd,	0,(%edx,%ecx,8), %mm0)
	pxor	%mm0, %mm1
')

	C Count this extra pass, so that the decrement done by the loop
	C instruction brings ecx back to the number of whole qwords.

	incl	%ecx
	jmp	L(loaded)


	ALIGN(16)
POP(`	nop	C alignment to avoid crossing 32-byte boundaries')

L(top):
	C eax	src
	C ebx
	C ecx	counter, qwords, decrementing
	C edx	[hamdist] src2
	C
	C mm0	(scratch)
	C mm1	(scratch)
	C mm2	total (low dword)
	C mm3
	C mm4	\
	C mm5	| special constants
	C mm6	|
	C mm7	/

	C Fetch qword number ecx-1, so the data is processed from the high
	C end of the operand down to offset 0.  For hamdist the two sources
	C are xored and the set bits of the result are counted.

	movq	-8(%eax,%ecx,8), %mm1
HAM(`	pxor	-8(%edx,%ecx,8), %mm1')

L(loaded):
	C mm1 holds the qword to count, call it x.
	C
	C Step 1: x - ((x & 0xAAAA...) >> 1) leaves in every 2-bit field the
	C number of bits that were set in that field.

	movq	%mm1, %mm0
	pand	REG_AAAAAAAAAAAAAAAA, %mm1

	psrlq	$1, %mm1
HAM(`	nop			C code alignment')

	psubd	%mm1, %mm0	C bit pairs
HAM(`	nop			C code alignment')


	C Step 2: add neighbouring 2-bit counts, giving a count per 4-bit
	C field.  Both operands are masked before the add.

	movq	%mm0, %mm1
	psrlq	$2, %mm0

	pand	REG_3333333333333333, %mm0
	pand	REG_3333333333333333, %mm1

	paddd	%mm1, %mm0	C nibbles


	C Step 3: add neighbouring nibble counts, giving a count per byte.

	movq	%mm0, %mm1
	psrlq	$4, %mm0

	pand	REG_0F0F0F0F0F0F0F0F, %mm0
	pand	REG_0F0F0F0F0F0F0F0F, %mm1

	paddd	%mm1, %mm0	C bytes

	C Step 4: add to every byte the byte above it.  No mask is applied
	C here, paddb adds each byte separately so nothing carries across.
	C Afterwards the even numbered bytes hold the counts for the four
	C 16-bit words.

	movq	%mm0, %mm1
	psrlq	$8, %mm0


	paddb	%mm1, %mm0	C words


	C Step 5: add the upper word count onto the lower one in each dword.
	C The low byte of each dword is then the count for that dword, the
	C other bytes hold leftovers which the 000000FF mask clears.

	movq	%mm0, %mm1
	psrlq	$16, %mm0

	paddd	%mm1, %mm0	C dwords

	pand	REG_000000FF000000FF, %mm0

	C Accumulate both dword counts into the low dword of mm2.  The high
	C dword of mm2 also changes in the first add but is never read.

	paddd	%mm0, %mm2	C low to total
	psrlq	$32, %mm0

	paddd	%mm0, %mm2	C high to total

	C loop decrements ecx and branches back while it is non-zero.

	loop	L(top)



	C Return the total in eax.
	C NOTE(review): emms_or_femms presumably expands to whichever of
	C emms or femms suits the target, clearing the mmx state before
	C returning, confirm against its definition.

	movd	%mm2, %eax
	emms_or_femms
	ret

EPILOGUE()