dnl  AMD64 SSSE3/XOP mpn_hamdist -- hamming distance.

dnl  Copyright 2010-2017 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.


include(`../config.m4')

C		    cycles/limb	  good for cpu?
C AMD K8,K9	     n/a
C AMD K10	     n/a
C AMD bd1	   1.51-2.0		y
C AMD bd2	   1.50-1.9		y
C AMD bd3	     ?
C AMD bd4	     ?
C AMD zen	     n/a
C AMD bobcat	     n/a
C AMD jaguar	     n/a
C Intel P4	     n/a
C Intel PNR	     n/a
C Intel NHM	     n/a
C Intel SBR	     n/a
C Intel IBR	     n/a
C Intel HWL	     n/a
C Intel BWL	     n/a
C Intel SKL	     n/a
C Intel atom	     n/a
C Intel SLM	     n/a
C VIA nano	     n/a

C TODO
C  * We need to use .byte for vpshlb, vpperm, vphaddubq, and all popcnt if we
C    intend to support old systems.

C We use vpshlb and vpperm below, which are XOP extensions to AVX.  Some
C systems, e.g., NetBSD, set OSXSAVE but nevertheless trigger SIGILL for AVX.
C We fall back to the core2 code.
ifdef(`GMP_AVX_NOT_REALLY_AVAILABLE',`
MULFUNC_PROLOGUE(mpn_hamdist)
include_mpn(`x86_64/core2/hamdist.asm')
',`

C mpn_hamdist (up, vp, n)
C
C Return the number of bit positions in which the n-limb operands at up and
C vp differ, i.e. the sum over all limbs of popcount(up[i] xor vp[i]).
C
C Syntax:  AT&T, preprocessed with m4.
C In:      rdi = up, rsi = vp, rdx = n (limb count), once FUNC_ENTRY is done.
C Out:     rax = bit count.
C
C The small path reads one limb before it first tests n, so n >= 1 is assumed.
C The vector path applies pxor with a memory operand to vp, which needs a
C 16-byte aligned address; one limb is peeled off when bit 3 of vp is set.
C NOTE(review): that peel only yields 16-byte alignment if vp is 8-byte
C aligned to begin with -- confirm against the callers.
C
C Register use on the vector path:
C   r8, r9   scalar scratch (r9 first holds the address of L(cnsts))
C   r10      running count from the scalar popcnt instructions
C   r11      computed jump target (PIC builds only)
C   xmm0-3   temporaries
C   xmm4     per-byte counts not yet folded into xmm8
C   xmm5     0x0f byte mask
C   xmm6     per-byte shift counts of -4 (vpshlb shifts right for negative
C            counts, so this extracts the high nibbles)
C   xmm7     16-entry table of nibble bit counts
C   xmm8     two 64-bit partial totals
C
C Each full pass through L(top) handles 8 limbs: three 16-byte blocks via the
C nibble table and the last two limbs with popcnt.  The first pass is entered
C at L(0), L(2), L(4) or L(6) so that it handles (n and 6) limbs, or a full 8
C when that value is zero.  An odd leftover limb is handled after the loop.

define(`up',		`%rdi')
define(`vp',		`%rsi')
define(`n',		`%rdx')

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(32)
PROLOGUE(mpn_hamdist)
	FUNC_ENTRY(3)
	cmp	$5, n
	jl	L(sma)			C fewer than 5 limbs: scalar loop only

C The Windows x64 convention makes xmm6-xmm15 callee-saved, and the vector
C path overwrites xmm6, xmm7 and xmm8, so preserve them for DOS64 builds.
C There is no red zone in that ABI, hence rsp is lowered before the stores.
C movdqu is used so that nothing is assumed about the alignment of rsp.
C IFDOS is expected to emit its argument for DOS64 builds only.
IFDOS(`	sub	$48, %rsp	')
IFDOS(`	movdqu	%xmm6, (%rsp)	')
IFDOS(`	movdqu	%xmm7, 16(%rsp)	')
IFDOS(`	movdqu	%xmm8, 32(%rsp)	')

	lea	L(cnsts)(%rip), %r9

	xor	R32(%r10), R32(%r10)	C scalar count = 0
	test	$8, R8(vp)
	jz	L(ali)
	mov	(up), %r8		C peel one limb to align vp
	xor	(vp), %r8
	add	$8, up
	add	$8, vp
	dec	n
	popcnt	%r8, %r10
L(ali):

C The jump table at the start of L(cnsts) has four entries; they are indexed
C with a stride of 4 bytes for PIC and 8 bytes otherwise, so the vector
C constants start at offset 16 or 32 respectively.
ifdef(`PIC', `define(`OFF1',16) define(`OFF2',32) define(`OFF3',48)',
	     `define(`OFF1',32) define(`OFF2',48) define(`OFF3',64)')
	movdqa	OFF1`'(%r9), %xmm7	C nibble counts table
	movdqa	OFF2`'(%r9), %xmm6	C splat shift counts
	movdqa	OFF3`'(%r9), %xmm5	C masks
	pxor	%xmm4, %xmm4
	pxor	%xmm8, %xmm8		C grand total count

C Bias up and vp so that the chosen entry point addresses the first limbs
C through its fixed displacement, then dispatch on (n and 6).
	mov	R32(n), R32(%rax)
	and	$6, R32(%rax)
	lea	-64(up,%rax,8), up
	lea	-64(vp,%rax,8), vp
ifdef(`PIC',`
	movslq	(%r9,%rax,2), %r11
	add	%r9, %r11
	jmp	*%r11
',`
	jmp	*(%r9,%rax,4)
')

L(0):	add	$64, up			C undo the bias: a full 8-limb pass
	add	$64, vp
	sub	$2, n

	ALIGN(32)
L(top):	lddqu	(up), %xmm0
	pxor	(vp), %xmm0
	.byte	0x8f,0xe9,0x48,0x94,0xc8	C vpshlb   %xmm6, %xmm0, %xmm1
	pand	%xmm5, %xmm0
	pand	%xmm5, %xmm1
	.byte	0x8f,0xe8,0x40,0xa3,0xd7,0x00	C vpperm   %xmm0,%xmm7,%xmm7,%xmm2
	.byte	0x8f,0xe8,0x40,0xa3,0xdf,0x10	C vpperm   %xmm1,%xmm7,%xmm7,%xmm3
	paddb	%xmm2, %xmm3
	paddb	%xmm3, %xmm4
L(6):	lddqu	16(up), %xmm0
	pxor	16(vp), %xmm0
	.byte	0x8f,0xe9,0x48,0x94,0xc8	C vpshlb   %xmm6, %xmm0, %xmm1
	pand	%xmm5, %xmm0
	pand	%xmm5, %xmm1
	.byte	0x8f,0xe8,0x40,0xa3,0xd7,0x00	C vpperm   %xmm0,%xmm7,%xmm7,%xmm2
	.byte	0x8f,0xe8,0x40,0xa3,0xdf,0x10	C vpperm   %xmm1,%xmm7,%xmm7,%xmm3
	paddb	%xmm2, %xmm3
	paddb	%xmm3, %xmm4
C In this block the byte counts gathered so far in xmm4 are summed into two
C 64-bit lanes and added to xmm8, and xmm4 is restarted from the counts of
C the current 16 bytes.
L(4):	lddqu	32(up), %xmm0
	pxor	32(vp), %xmm0
	.byte	0x8f,0xe9,0x48,0x94,0xc8	C vpshlb   %xmm6, %xmm0, %xmm1
	pand	%xmm5, %xmm0
	pand	%xmm5, %xmm1
	.byte	0x8f,0xe8,0x40,0xa3,0xd7,0x00	C vpperm   %xmm0,%xmm7,%xmm7,%xmm2
	.byte	0x8f,0xe9,0x78,0xd3,0xc4	C vphaddubq %xmm4, %xmm0
	.byte	0x8f,0xe8,0x40,0xa3,0xe7,0x10	C vpperm   %xmm1,%xmm7,%xmm7,%xmm4
C NOTE(review): xmm3 is rewritten at L(top) before it is read again, so the
C next instruction looks redundant; it is kept as found.
	paddb	%xmm2, %xmm3
	paddb	%xmm2, %xmm4
	paddq	%xmm0, %xmm8		C sum to 2 x 64-bit counts
L(2):	mov	48(up), %r8		C last two limbs of the pass, scalar
	mov	56(up), %r9
	add	$64, up
	xor	48(vp), %r8
	xor	56(vp), %r9
	add	$64, vp
	popcnt	%r8, %r8
	popcnt	%r9, %r9
	add	%r8, %r10
	add	%r9, %r10
	sub	$8, n
	jg	L(top)

	test	$1, R8(n)		C odd limb left over?
	jz	L(x)
	mov	(up), %r8
	xor	(vp), %r8
	popcnt	%r8, %r8
	add	%r8, %r10
L(x):	.byte	0x8f,0xe9,0x78,0xd3,0xc4	C vphaddubq %xmm4, %xmm0
	paddq	%xmm0, %xmm8
	pshufd	$14, %xmm8, %xmm0	C bring the high 64-bit total down
	paddq	%xmm8, %xmm0
	movq	%xmm0, %rax
	add	%r10, %rax		C vector total + scalar total
C Every exit from the vector path passes through here, so this is the one
C place where the DOS64 callee-saved registers have to be restored.
IFDOS(`	movdqu	(%rsp), %xmm6	')
IFDOS(`	movdqu	16(%rsp), %xmm7	')
IFDOS(`	movdqu	32(%rsp), %xmm8	')
IFDOS(`	add	$48, %rsp	')
	FUNC_EXIT()
	ret

C Small operands: one popcnt per limb, no xmm registers touched.
L(sma):	mov	(up), %r8
	xor	(vp), %r8
	popcnt	%r8, %rax
	dec	n
	jz	L(ed)
L(tp):	mov	8(up), %r8
	add	$8, up
	xor	8(vp), %r8
	add	$8, vp
	popcnt	%r8, %r8
	add	%r8, %rax
	dec	n
	jnz	L(tp)
L(ed):	FUNC_EXIT()
	ret
EPILOGUE()

C Jump table for the four loop entry points, followed by the three 16-byte
C vector constants loaded through OFF1, OFF2 and OFF3.
DEF_OBJECT(L(cnsts),16,`JUMPTABSECT')
	JMPENT(	L(0), L(cnsts))
	JMPENT(	L(2), L(cnsts))
	JMPENT(	L(4), L(cnsts))
	JMPENT(	L(6), L(cnsts))
	C bit counts of the nibble values 0 through 15
	.byte	0x00,0x01,0x01,0x02,0x01,0x02,0x02,0x03
	.byte	0x01,0x02,0x02,0x03,0x02,0x03,0x03,0x04
	C per-byte shift counts for vpshlb
	.byte	-4,-4,-4,-4,-4,-4,-4,-4
	.byte	-4,-4,-4,-4,-4,-4,-4,-4
	C low-nibble mask
	.byte	0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f
	.byte	0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f
END_OBJECT(L(cnsts))
')