1dnl AMD64 mpn_popcount, mpn_hamdist -- population count and hamming distance. 2 3dnl Copyright 2004, 2005, 2007, 2010-2012 Free Software Foundation, Inc. 4 5dnl This file is part of the GNU MP Library. 6dnl 7dnl The GNU MP Library is free software; you can redistribute it and/or modify 8dnl it under the terms of either: 9dnl 10dnl * the GNU Lesser General Public License as published by the Free 11dnl Software Foundation; either version 3 of the License, or (at your 12dnl option) any later version. 13dnl 14dnl or 15dnl 16dnl * the GNU General Public License as published by the Free Software 17dnl Foundation; either version 2 of the License, or (at your option) any 18dnl later version. 19dnl 20dnl or both in parallel, as here. 21dnl 22dnl The GNU MP Library is distributed in the hope that it will be useful, but 23dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 24dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 25dnl for more details. 26dnl 27dnl You should have received copies of the GNU General Public License and the 28dnl GNU Lesser General Public License along with the GNU MP Library. If not, 29dnl see https://www.gnu.org/licenses/. 30 31 32include(`../config.m4') 33 34 35C popcount hamdist 36C cycles/limb cycles/limb 37C AMD K8,K9 6 7 38C AMD K10 6 7 39C Intel P4 12 14.3 40C Intel core2 7 8 41C Intel corei ? 7.3 42C Intel atom 16.5 17.5 43C VIA nano 8.75 10.4 44 45C TODO 46C * Tune. It should be possible to reach 5 c/l for popcount and 6 c/l for 47C hamdist for K8/K9. 
C Register roles once the entry sequence has run:
C   up                  end of the first operand (base + 8*limbs)
C   vp                  hamdist only: end of the second operand
C   n                   negated number of limbs left, counts up towards zero
C   h55555555 .. h0f0f0f0f   masks for the 1-, 2- and 4-bit reduction steps
C   h01010101           multiplier that sums eight byte fields into the top byte
C   %r8, %r9            scratch for the first limb of a pair
C   %r12, %r13          scratch for the second limb of a pair (saved here)
C   %rax                running total, left in %rax at the ret
C
C For hamdist each limb is the xor of the two operand limbs, after which the
C bit counting is the same as for popcount.
C
C NOTE: a limb count of zero is not handled; an even count goes straight
C into the two-limb loop.  Presumably callers guarantee at least one limb.

C Registers shared by both operations.
define(`up',		`%rdi')
define(`h55555555',	`%r10')
define(`h33333333',	`%r11')
define(`h0f0f0f0f',	`%rcx')

ifdef(`OPERATION_popcount',`
  define(`func',	`mpn_popcount')
  define(`n',		`%rsi')
  define(`h01010101',	`%rdx')
  define(`POP',		`$1')
  define(`HAM',		`dnl')
')
ifdef(`OPERATION_hamdist',`
  define(`func',	`mpn_hamdist')
  define(`vp',		`%rsi')
  define(`n',		`%rdx')
  define(`h01010101',	`%r14')
  define(`POP',		`dnl')
  define(`HAM',		`$1')
')


MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist)

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(32)
PROLOGUE(func)
 POP(`	FUNC_ENTRY(2)		')
 HAM(`	FUNC_ENTRY(3)		')
	push	%r12
	push	%r13
 HAM(`	push	%r14		')

	xor	R32(%rax), R32(%rax)		C total = 0

	lea	(up,n,8), up			C point past the operand(s) ...
 HAM(`	lea	(vp,n,8), vp	')
	neg	n				C ... and index with -n .. -1

	mov	$0x0101010101010101, h01010101
	mov	$0x0f0f0f0f0f0f0f0f, h0f0f0f0f
	mov	$0x3333333333333333, h33333333
	mov	$0x5555555555555555, h55555555

	test	$1, R32(n)			C negation keeps the parity
	jz	L(top)				C even count: pairs only

C Odd count: reduce the first limb on its own down to 4-bit fields, then
C join the loop at the point where a pair has been merged.
	mov	(up,n,8), %r8
 HAM(`	xor	(vp,n,8), %r8	')

	mov	%r8, %r9
	shr	%r8
	and	h55555555, %r8
	sub	%r8, %r9			C 32 2-bit fields (0..2)

	mov	%r9, %r8
	shr	$2, %r9
	and	h33333333, %r8
	and	h33333333, %r9
	add	%r8, %r9			C 16 4-bit fields (0..4)

	dec	n				C so that the add of 2 below nets +1
	jmp	L(mid)

	ALIGN(16)
L(top):	mov	(up,n,8), %r8
	mov	8(up,n,8), %r12
 HAM(`	xor	(vp,n,8), %r8	')
 HAM(`	xor	8(vp,n,8), %r12	')

	mov	%r8, %r9
	mov	%r12, %r13
	shr	%r8
	shr	%r12
	and	h55555555, %r8
	and	h55555555, %r12
	sub	%r8, %r9			C 32 2-bit fields (0..2)
	sub	%r12, %r13			C 32 2-bit fields (0..2)

	mov	%r9, %r8
	mov	%r13, %r12
	shr	$2, %r9
	shr	$2, %r13
	and	h33333333, %r8
	and	h33333333, %r9
	and	h33333333, %r12
	and	h33333333, %r13
	add	%r8, %r9			C 16 4-bit fields (0..4)
	add	%r12, %r13			C 16 4-bit fields (0..4)

	add	%r13, %r9			C 16 4-bit fields (0..8)

C Common tail.  From the loop %r9 holds the merged pair, from the odd-count
C entry it holds a single limb.
L(mid):	mov	%r9, %r8
	shr	$4, %r9
	and	h0f0f0f0f, %r8
	and	h0f0f0f0f, %r9
	add	%r8, %r9			C 8 8-bit fields (0..16, or 0..8 for a lone limb)

	imul	h01010101, %r9			C sum the 8 fields in high 8 bits
	shr	$56, %r9

	add	%r9, %rax			C add to total
	add	$2, n
	jnc	L(top)				C carry out means n wrapped to zero

 HAM(`	pop	%r14		')
	pop	%r13
	pop	%r12
	FUNC_EXIT()
	ret
EPILOGUE()