dnl  Intel Pentium 4 mpn_popcount, mpn_hamdist -- population count and
dnl  hamming distance.

dnl  Copyright 2000, 2001, 2002, 2007 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or
dnl  modify it under the terms of the GNU Lesser General Public License as
dnl  published by the Free Software Foundation; either version 3 of the
dnl  License, or (at your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful,
dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
dnl  Lesser General Public License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C                              popcount        hamdist
C P3 model 9  (Banias)            ?               ?
C P3 model 13 (Dothan)            6               6
C P4 model 0  (Willamette)
C P4 model 1  (?)
C P4 model 2  (Northwood)         8               9
C P4 model 3  (Prescott)          8               9
C P4 model 4  (Nocona)

C unsigned long mpn_popcount (mp_srcptr src, mp_size_t size);
C unsigned long mpn_hamdist (mp_srcptr src, mp_srcptr src2, mp_size_t size);
C
C Loading with unaligned movq's costs an extra 1 c/l and hence is avoided.
C Two movd's and a punpckldq seem to be the same speed as an aligned movq,
C and using them saves fiddling about with alignment testing on entry.
C
C For popcount there are 13 mmx instructions in the loop, so perhaps 6.5 c/l
C might be possible, but 8 c/l relying on out-of-order execution is already
C quite reasonable.

C Exactly one of OPERATION_popcount or OPERATION_hamdist has to be defined
C when this file is processed; it selects which entry point is generated.

ifdef(`OPERATION_popcount',,
`ifdef(`OPERATION_hamdist',,
`m4_error(`Need OPERATION_popcount or OPERATION_hamdist defined
')')')

C HAM(text) expands to its argument only in the hamdist build, and POP(text)
C only in the popcount build.  In the other build each expands to nothing.

define(HAM,
m4_assert_numargs(1)
`ifdef(`OPERATION_hamdist',`$1')')

define(POP,
m4_assert_numargs(1)
`ifdef(`OPERATION_popcount',`$1')')

C Names for the stack slots of the C arguments, and the function name to
C emit.  NOTE(review): defframe is defined outside this file; the offsets
C look like displacements from %esp at entry, with FRAME tracking any later
C stack adjustment -- confirm against the x86 m4 definitions.

HAM(`
defframe(PARAM_SIZE, 12)
defframe(PARAM_SRC2,  8)
defframe(PARAM_SRC,   4)
define(M4_function,mpn_hamdist)
')
POP(`
defframe(PARAM_SIZE,  8)
defframe(PARAM_SRC,   4)
define(M4_function,mpn_popcount)
')

MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist)


C Non-PIC builds keep the three 64-bit masks in read-only data and fetch
C them with movq.  PIC builds construct the same values in registers at
C function entry instead, so this table is omitted there.

ifdef(`PIC',,`
	dnl  non-PIC
	RODATA
	ALIGN(8)
L(rodata_AAAAAAAAAAAAAAAA):
	.long	0xAAAAAAAA
	.long	0xAAAAAAAA
L(rodata_3333333333333333):
	.long	0x33333333
	.long	0x33333333
L(rodata_0F0F0F0F0F0F0F0F):
	.long	0x0F0F0F0F
	.long	0x0F0F0F0F
')

	TEXT
	ALIGN(16)

C M4_function is mpn_popcount or mpn_hamdist, as selected above.
C
C In:	PARAM_SRC	pointer to the first limb of src
C	PARAM_SRC2	[hamdist] pointer to the first limb of src2
C	PARAM_SIZE	number of 32-bit limbs to process
C Out:	eax		number of 1 bits in src (popcount), or in src XOR src2
C			(hamdist)
C Uses:	eax, ecx, edx, mm0-mm7, flags.  edx holds src2 for hamdist and is
C	scratch while building the masks in PIC builds.  mm3 is touched by
C	hamdist only.
C
C Limbs are consumed two at a time, packed into one 64-bit MMX register.
C An odd trailing limb, or a single-limb operand, goes through L(last),
C which loads one limb zero-extended and re-uses the same counting code.
C
C size == 0 returns 0.  Without the explicit check at entry the code would
C fall into L(last) with ecx == -1 and read the limb just below src (and
C src2).  Callers are presumably expected to pass size >= 1 anyway; the
C check costs one untaken branch on the size == 1 path only.

PROLOGUE(M4_function)
deflit(`FRAME',0)

	movl	PARAM_SIZE, %ecx
	movl	PARAM_SRC, %eax

ifdef(`PIC',`
	C Build each mask from a 32-bit immediate copied into both halves.
	C edx is free as scratch here because src2 is loaded afterwards.
	movl	$0xAAAAAAAA, %edx
	movd	%edx, %mm7
	punpckldq %mm7, %mm7

	movl	$0x33333333, %edx
	movd	%edx, %mm6
	punpckldq %mm6, %mm6

	movl	$0x0F0F0F0F, %edx
	movd	%edx, %mm5
	punpckldq %mm5, %mm5

HAM(`	movl	PARAM_SRC2, %edx')

',`
	dnl non-PIC
HAM(`	movl	PARAM_SRC2, %edx')
	movq	L(rodata_AAAAAAAAAAAAAAAA), %mm7
	movq	L(rodata_3333333333333333), %mm6
	movq	L(rodata_0F0F0F0F0F0F0F0F), %mm5
')

	pxor	%mm4, %mm4		C zero
	pxor	%mm0, %mm0		C total

	subl	$1, %ecx
	ja	L(top)			C size >= 2: enter the two-limb loop
	jb	L(done)			C size == 0: borrow set, total stays 0

L(last):
	C Reached with ecx == 0, either from entry with size == 1 or from
	C the loop exit with one limb left, so the load below reads the
	C limb eax currently points at.  movd clears the upper half of the
	C mmx register, so the missing second limb counts as zero.
	movd	(%eax,%ecx,4), %mm1		C src high limb
HAM(`	movd	(%edx,%ecx,4), %mm2
	pxor	%mm2, %mm1
')
	jmp	L(loaded)


L(top):
	C eax	src
	C ebx
	C ecx	counter, size-1 to 2 or 1, inclusive
	C edx	[hamdist] src2
	C
	C mm0	total (low dword)
	C mm1	(scratch)
	C mm2	(scratch)
	C mm3
	C mm4	0x0000000000000000
	C mm5	0x0F0F0F0F0F0F0F0F
	C mm6	0x3333333333333333
	C mm7	0xAAAAAAAAAAAAAAAA

	C Two 32-bit loads merged by punpckldq rather than a single movq;
	C mm1 = src[1]:src[0].
	movd	(%eax), %mm1
	movd	4(%eax), %mm2
	punpckldq %mm2, %mm1
	addl	$8, %eax

HAM(`	movd	(%edx), %mm2
	movd	4(%edx), %mm3
	punpckldq %mm3, %mm2
	pxor	%mm2, %mm1
	addl	$8, %edx
')

L(loaded):
	C mm1 holds the 64 bits to be counted.

	C x - ((x & 0xAA..AA) >> 1) leaves the count of each 2-bit field
	C in that field.
	movq	%mm7, %mm2
	pand	%mm1, %mm2
	psrlq	$1, %mm2
	psubd	%mm2, %mm1	C bit pairs

	C (x & 0x33..33) + ((x >> 2) & 0x33..33) sums adjacent 2-bit
	C counts into 4-bit fields.
	movq	%mm6, %mm2
	pand	%mm1, %mm2
	psrlq	$2, %mm1
	pand	%mm6, %mm1
	paddd	%mm2, %mm1	C nibbles

	C (x & 0x0F..0F) + ((x >> 4) & 0x0F..0F) sums adjacent nibble
	C counts into bytes.
	movq	%mm5, %mm2
	pand	%mm1, %mm2
	psrlq	$4, %mm1
	pand	%mm5, %mm1
	paddd	%mm2, %mm1	C bytes

	C Sum of absolute differences against zero adds the eight byte
	C counts into the low word of mm1.  NOTE(review): psadbw is written
	C in macro-call form here, presumably so the m4 layer can emit it
	C for assemblers lacking the mnemonic -- confirm.
	psadbw(	%mm4, %mm1)
	paddd	%mm1, %mm0	C to total

	subl	$2, %ecx
	jg	L(top)

	C After a two-limb pass ecx is 0 or -1 representing respectively
	C 1 or 0 further limbs.  After the L(last) pass it is -2, so
	C neither branch is taken and the function finishes.
	jz	L(last)


L(done):
	movd	%mm0, %eax	C return the low dword of the total
	emms			C clear the mmx state before returning
	ret

EPILOGUE()