1dnl AMD K7 mpn_popcount, mpn_hamdist -- population count and hamming 2dnl distance. 3 4dnl Copyright 2000-2002 Free Software Foundation, Inc. 5 6dnl This file is part of the GNU MP Library. 7dnl 8dnl The GNU MP Library is free software; you can redistribute it and/or modify 9dnl it under the terms of either: 10dnl 11dnl * the GNU Lesser General Public License as published by the Free 12dnl Software Foundation; either version 3 of the License, or (at your 13dnl option) any later version. 14dnl 15dnl or 16dnl 17dnl * the GNU General Public License as published by the Free Software 18dnl Foundation; either version 2 of the License, or (at your option) any 19dnl later version. 20dnl 21dnl or both in parallel, as here. 22dnl 23dnl The GNU MP Library is distributed in the hope that it will be useful, but 24dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 25dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 26dnl for more details. 27dnl 28dnl You should have received copies of the GNU General Public License and the 29dnl GNU Lesser General Public License along with the GNU MP Library. If not, 30dnl see https://www.gnu.org/licenses/. 31 32include(`../config.m4') 33 34 35C popcount hamdist 36C P3 generic 6.5 7 37C P3 model 9 (Banias) 5.7 6.1 38C P3 model 13 (Dothan) 5.75 6 39C K7 5 6 40 41C unsigned long mpn_popcount (mp_srcptr src, mp_size_t size); 42C unsigned long mpn_hamdist (mp_srcptr src, mp_srcptr src2, mp_size_t size); 43C 44C The code here is almost certainly not optimal, but is already a 3x speedup 45C over the generic C code. The main improvement would be to interleave 46C processing of two qwords in the loop so as to fully exploit the available 47C execution units, possibly leading to 3.25 c/l (13 cycles for 4 limbs). 48C 49C The loop is based on the example "Efficient 64-bit population count using 50C MMX instructions" in the Athlon Optimization Guide, AMD document 22007, 51C page 158 of rev E (reference in mpn/x86/k7/README). 52 53ifdef(`OPERATION_popcount',, 54`ifdef(`OPERATION_hamdist',, 55`m4_error(`Need OPERATION_popcount or OPERATION_hamdist defined 56')')') 57 58define(HAM, 59m4_assert_numargs(1) 60`ifdef(`OPERATION_hamdist',`$1')') 61 62define(POP, 63m4_assert_numargs(1) 64`ifdef(`OPERATION_popcount',`$1')') 65 66HAM(` 67defframe(PARAM_SIZE, 12) 68defframe(PARAM_SRC2, 8) 69defframe(PARAM_SRC, 4) 70define(M4_function,mpn_hamdist) 71') 72POP(` 73defframe(PARAM_SIZE, 8) 74defframe(PARAM_SRC, 4) 75define(M4_function,mpn_popcount) 76') 77 78MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist) 79 80 81ifdef(`PIC',,` 82 dnl non-PIC 83 84 RODATA 85 ALIGN(8) 86 87L(rodata_AAAAAAAAAAAAAAAA): 88 .long 0xAAAAAAAA 89 .long 0xAAAAAAAA 90 91L(rodata_3333333333333333): 92 .long 0x33333333 93 .long 0x33333333 94 95L(rodata_0F0F0F0F0F0F0F0F): 96 .long 0x0F0F0F0F 97 .long 0x0F0F0F0F 98') 99 100 TEXT 101 ALIGN(32) 102 103PROLOGUE(M4_function) 104deflit(`FRAME',0) 105 106 movl PARAM_SIZE, %ecx 107 108ifdef(`PIC',` 109 movl $0xAAAAAAAA, %eax 110 movl $0x33333333, %edx 111 112 movd %eax, %mm7 113 movd %edx, %mm6 114 115 movl $0x0F0F0F0F, %eax 116 117 punpckldq %mm7, %mm7 118 punpckldq %mm6, %mm6 119 120 movd %eax, %mm5 121 movd %edx, %mm4 122 123 punpckldq %mm5, %mm5 124 125',` 126 movq L(rodata_AAAAAAAAAAAAAAAA), %mm7 127 movq L(rodata_3333333333333333), %mm6 128 movq L(rodata_0F0F0F0F0F0F0F0F), %mm5 129') 130 pxor %mm4, %mm4 131 132define(REG_AAAAAAAAAAAAAAAA,%mm7) 133define(REG_3333333333333333,%mm6) 134define(REG_0F0F0F0F0F0F0F0F,%mm5) 135define(REG_0000000000000000,%mm4) 136 137 138 movl PARAM_SRC, %eax 139HAM(` movl PARAM_SRC2, %edx') 140 141 pxor %mm2, %mm2 C total 142 143 shrl %ecx 144 jnc L(top) 145 146 movd (%eax,%ecx,8), %mm1 147 148HAM(` movd (%edx,%ecx,8), %mm0 149 pxor %mm0, %mm1 150') 151 orl %ecx, %ecx 152 jmp L(loaded) 153 154 155 ALIGN(16) 156L(top): 157 C eax src 158 C ebx 159 C ecx counter, qwords, decrementing 160 C edx [hamdist] src2 161 C 162 C mm0 (scratch) 163 C mm1 (scratch) 164 C mm2 total (low dword) 165 C mm3 166 C mm4 \ 167 C mm5 | special constants 168 C mm6 | 169 C mm7 / 170 171 movq -8(%eax,%ecx,8), %mm1 172 173HAM(` pxor -8(%edx,%ecx,8), %mm1') 174 decl %ecx 175 176L(loaded): 177 movq %mm1, %mm0 178 pand REG_AAAAAAAAAAAAAAAA, %mm1 179 180 psrlq $1, %mm1 181 182 psubd %mm1, %mm0 C bit pairs 183 184 185 movq %mm0, %mm1 186 psrlq $2, %mm0 187 188 pand REG_3333333333333333, %mm0 189 pand REG_3333333333333333, %mm1 190 191 paddd %mm1, %mm0 C nibbles 192 193 194 movq %mm0, %mm1 195 psrlq $4, %mm0 196 197 pand REG_0F0F0F0F0F0F0F0F, %mm0 198 pand REG_0F0F0F0F0F0F0F0F, %mm1 199 200 paddd %mm1, %mm0 C bytes 201 202 203 psadbw( %mm4, %mm0) 204 205 paddd %mm0, %mm2 C add to total 206 jnz L(top) 207 208 209 movd %mm2, %eax 210 emms 211 ret 212 213EPILOGUE() 214