dnl  AMD64 SSSE3/XOP mpn_popcount -- population count.

dnl  Copyright 2010-2017 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.


include(`../config.m4')

C                  cycles/limb    good for cpu?
C AMD K8,K9           n/a
C AMD K10             n/a
C AMD bd1             1.27            y
C AMD bd2             1.24            y
C AMD bd3              ?
C AMD bd4             1.22
C AMD zen             n/a
C AMD bobcat          n/a
C AMD jaguar          n/a
C Intel P4            n/a
C Intel CNR           n/a
C Intel PNR           n/a
C Intel NHM           n/a
C Intel SBR           n/a
C Intel IBR           n/a
C Intel HWL           n/a
C Intel BWL           n/a
C Intel SKL           n/a
C Intel atom          n/a
C Intel SLM           n/a
C VIA nano            n/a

C TODO
C  * We need to use .byte for vpshlb, vpperm, vphaddubq, and all popcnt if we
C    intend to support old systems.

C We use vpshlb and vpperm below, which are XOP extensions to AVX.  Some
C systems, e.g., NetBSD, set OSXSAVE but nevertheless trigger SIGILL for AVX.
C We fall back to the core2 code.
ifdef(`GMP_AVX_NOT_REALLY_AVAILABLE',`
MULFUNC_PROLOGUE(mpn_popcount)
include_mpn(`x86_64/core2/popcount.asm')
',`

C mpn_popcount -- return the number of set bits in the n limbs at up.
C
C Syntax: AT&T, preprocessed by m4.  Comments inside this quoted ifdef branch
C must not contain m4 quote characters.
C
C Register roles:
C   up  (%rdi)  source pointer, 8 bytes per limb
C   n   (%rsi)  limbs remaining
C   %rax        n mod 8 for the dispatch, then the return value
C   %rdx        running sum of the scalar popcnt results
C   %r8, %r9    scalar popcnt scratch (%r9 first holds the L(cnsts) address)
C   %xmm7       16-entry table of nibble bit counts
C   %xmm6       per-byte shift counts, all -4
C   %xmm9       per-byte mask, all 0x0f
C   %xmm4       per-byte partial counts, folded by vphaddubq once per pass
C   %xmm5       zeroed at entry, later receives the vphaddubq sums
C   %xmm8       running total as 2 x 64-bit counts
C
C One loop pass handles 8 limbs: three 16-byte blocks through the nibble
C table plus two limbs through scalar popcnt.  The first pass is entered at
C a point chosen by n mod 8, with up biased so the fixed displacements line
C up with the start of the operand.
C
C NOTE(review): presumably n >= 1.  n = 0 would dispatch to L(top) and read
C 8 limbs -- TODO confirm callers guarantee it.
C
C NOTE(review): %xmm6-%xmm9 are written below and are callee-saved in the
C Microsoft x64 convention, while DOS64 is declared supported.  Nothing
C visible in this file saves them -- confirm against FUNC_ENTRY/FUNC_EXIT.

define(`up', `%rdi')
define(`n', `%rsi')

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
        TEXT
        ALIGN(32)
PROLOGUE(mpn_popcount)
C NOTE(review): only up and n are used here, yet the count passed is 3 --
C presumably harmless since %rdx is cleared below; TODO confirm.
        FUNC_ENTRY(3)
        lea     L(cnsts)(%rip), %r9

C The three 16-byte constants follow the 8-entry jump table, whose entries
C are read as 4 bytes each under PIC and 8 bytes each otherwise.
ifdef(`PIC', `define(`OFF1',32) define(`OFF2',48) define(`OFF3',64)',
             `define(`OFF1',64) define(`OFF2',80) define(`OFF3',96)')
        movdqa  OFF1`'(%r9), %xmm7      C nibble counts table
        movdqa  OFF2`'(%r9), %xmm6      C splat shift counts
        movdqa  OFF3`'(%r9), %xmm9      C masks
        pxor    %xmm4, %xmm4            C per-byte partial counts
        pxor    %xmm5, %xmm5            C 0-reg
        pxor    %xmm8, %xmm8            C grand total count

        xor     R32(%rdx), R32(%rdx)    C scalar popcnt sum

C Dispatch on n mod 8 through the table at L(cnsts).
        mov     R32(n), R32(%rax)
        and     $7, R32(%rax)
ifdef(`PIC',`
        movslq  (%r9,%rax,4), %rax      C table-relative 32-bit entry
        add     %r9, %rax
        jmp     *%rax
',`
        jmp     *(%r9,%rax,8)
')

C n mod 8 = 1: count one limb, then either return or run whole passes.
L(1):   .byte   0xf3,0x48,0x0f,0xb8,0x17        C popcnt (up),%rdx
        add     $8, up
        dec     n
        jnz     L(top)
        mov     %rdx, %rax
        FUNC_EXIT()
        ret

C Even residues bias up so the entry point reads from the operand start.
C Odd residues first count the leading limb with scalar popcnt and bias up
C by 8 more.
L(2):   add     $-48, up
        jmp     L(e2)

L(3):   .byte   0xf3,0x48,0x0f,0xb8,0x17        C popcnt (up), %rdx
        add     $-40, up
        jmp     L(e2)

L(4):   add     $-32, up
        jmp     L(e4)

L(5):   .byte   0xf3,0x48,0x0f,0xb8,0x17        C popcnt (up), %rdx
        add     $-24, up
        jmp     L(e4)

L(6):   add     $-16, up
        jmp     L(e6)

L(7):   .byte   0xf3,0x48,0x0f,0xb8,0x17        C popcnt (up), %rdx
        add     $-8, up
        jmp     L(e6)

C Main loop.  For each 16-byte block: split every byte into its low nibble
C (xmm0) and its high nibble (xmm1, via the per-byte shift and the mask),
C look both up in the nibble table with vpperm, and add the two results.
        ALIGN(32)
L(top): lddqu   (up), %xmm0
        .byte   0x8f,0xe9,0x48,0x94,0xc8        C vpshlb %xmm6, %xmm0, %xmm1
        pand    %xmm9, %xmm0
        pand    %xmm9, %xmm1
        .byte   0x8f,0xe8,0x40,0xa3,0xd7,0x00   C vpperm %xmm0,%xmm7,%xmm7,%xmm2
        .byte   0x8f,0xe8,0x40,0xa3,0xdf,0x10   C vpperm %xmm1, %xmm7, %xmm7, %xmm3
        paddb   %xmm2, %xmm3
        paddb   %xmm3, %xmm4
L(e6):  lddqu   16(up), %xmm0
        .byte   0x8f,0xe9,0x48,0x94,0xc8        C vpshlb %xmm6, %xmm0, %xmm1
        pand    %xmm9, %xmm0
        pand    %xmm9, %xmm1
        .byte   0x8f,0xe8,0x40,0xa3,0xd7,0x00   C vpperm %xmm0,%xmm7,%xmm7,%xmm2
        .byte   0x8f,0xe8,0x40,0xa3,0xdf,0x10   C vpperm %xmm1,%xmm7,%xmm7,%xmm3
        paddb   %xmm2, %xmm3
        paddb   %xmm3, %xmm4
L(e4):  lddqu   32(up), %xmm0
        .byte   0x8f,0xe9,0x48,0x94,0xc8        C vpshlb %xmm6, %xmm0, %xmm1
        pand    %xmm9, %xmm0
        pand    %xmm9, %xmm1
        .byte   0x8f,0xe8,0x40,0xa3,0xd7,0x00   C vpperm %xmm0, %xmm7, %xmm7, %xmm2
C Fold the byte counts gathered so far into xmm5 before xmm4 is overwritten
C with the counts of the third block.
        .byte   0x8f,0xe9,0x78,0xd3,0xec        C vphaddubq %xmm4, %xmm5
        .byte   0x8f,0xe8,0x40,0xa3,0xe7,0x10   C vpperm %xmm1,%xmm7,%xmm7,%xmm4
        paddb   %xmm2, %xmm4
C The last two limbs of each pass go through scalar popcnt.
L(e2):  popcnt  48(up), %r8
        popcnt  56(up), %r9
        add     $64, up
        paddq   %xmm5, %xmm8            C sum to 2 x 64-bit counts
        add     %r8, %rdx
        add     %r9, %rdx
        sub     $8, n
        jg      L(top)

C Wind down: fold the byte counts still held in xmm4, add the two 64-bit
C halves of the vector total, then add the scalar sum.
        .byte   0x8f,0xe9,0x78,0xd3,0xec        C vphaddubq %xmm4, %xmm5
        paddq   %xmm5, %xmm8
        pshufd  $14, %xmm8, %xmm0       C high 64 bits to the low half
        paddq   %xmm8, %xmm0
        movq    %xmm0, %rax
        add     %rdx, %rax
        FUNC_EXIT()
        ret
EPILOGUE()

C Jump table indexed by n mod 8, followed by the three 16-byte constants
C loaded at OFF1, OFF2 and OFF3.
DEF_OBJECT(L(cnsts),16,`JUMPTABSECT')
        JMPENT( L(top), L(cnsts))
        JMPENT( L(1), L(cnsts))
        JMPENT( L(2), L(cnsts))
        JMPENT( L(3), L(cnsts))
        JMPENT( L(4), L(cnsts))
        JMPENT( L(5), L(cnsts))
        JMPENT( L(6), L(cnsts))
        JMPENT( L(7), L(cnsts))
C Bit counts of the nibble values 0..15.
        .byte   0x00,0x01,0x01,0x02,0x01,0x02,0x02,0x03
        .byte   0x01,0x02,0x02,0x03,0x02,0x03,0x03,0x04
C Per-byte shift counts for vpshlb.
        .byte   -4,-4,-4,-4,-4,-4,-4,-4
        .byte   -4,-4,-4,-4,-4,-4,-4,-4
C Per-byte nibble mask.
        .byte   0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f
        .byte   0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f
END_OBJECT(L(cnsts))
')