1dnl AMD64 mpn_popcount -- population count. 2 3dnl Copyright 2008, 2010, 2011, 2012 Free Software Foundation, Inc. 4 5dnl This file is part of the GNU MP Library. 6 7dnl The GNU MP Library is free software; you can redistribute it and/or modify 8dnl it under the terms of the GNU Lesser General Public License as published 9dnl by the Free Software Foundation; either version 3 of the License, or (at 10dnl your option) any later version. 11 12dnl The GNU MP Library is distributed in the hope that it will be useful, but 13dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 14dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public 15dnl License for more details. 16 17dnl You should have received a copy of the GNU Lesser General Public License 18dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 19 20include(`../config.m4') 21 22C cycles/limb 23C AMD K8,K9 n/a 24C AMD K10 1.125 25C Intel P4 n/a 26C Intel core2 n/a 27C Intel corei 1.25 28C Intel atom n/a 29C VIA nano n/a 30 31C * The zero-offset of popcount is misassembled to the offset-less form, which 32C is one byte shorter and therefore will mess up the switching code. 33C * The outdated gas used in FreeBSD and NetBSD cannot handle the POPCNT insn, 34C which is the main reason for our usage of '.byte'. 35 36C TODO 37C * Improve switching code, the current code sucks. 
C Operand registers.  The .byte-encoded popcnt instructions in the table at
C L(top) hard-code %rdi as base and %rsi as index, so these two defines cannot
C be changed without re-encoding those bytes.
define(`up', `%rdi')
define(`n', `%rsi')

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

C mpn_popcount -- sum of the population counts of the n limbs starting at up.
C
C In:    up = address of the first 64-bit limb, n = number of limbs.
C Out:   %rax = total number of set bits.
C Uses:  %rcx, %rdx, %r8, %r9 and the flags as scratch.  FUNC_ENTRY and
C        FUNC_EXIT are defined outside this file; presumably they adapt the
C        DOS64 calling convention to the rdi/rsi argument registers used
C        here -- confirm against their definitions.
C
C Method: the loop body is a table of 8 popcnt+add pairs.  Each pair occupies
C exactly 10 bytes (7 bytes of .byte-encoded popcnt followed by the add).  The
C setup code computes skip = (-n) mod 8 and jumps to L(top) + 10*skip, so the
C first pass handles n mod 8 limbs (a full 8 when n mod 8 = 0) and every later
C pass handles 8 limbs.
C
C NOTE(review): n = 0 is not special-cased.  With n = 0 the code enters at
C L(top) and counts the 8 limbs starting at up.  Presumably callers guarantee
C n >= 1 -- confirm.

ASM_START()
	TEXT
	ALIGN(32)
PROLOGUE(mpn_popcount)
	FUNC_ENTRY(2)

ifelse(1,1,`
C Live variant: the ifelse condition compares 1 with 1, so only this first
C branch is ever expanded.
	lea	(up,n,8), up		C up = one past the last limb

C	mov	R32(n), R32(%rcx)
C	neg	R32(%rcx)
	imul	$-1, R32(n), R32(%rcx)	C ecx = -n in one insn, n itself untouched
	and	$8-1, R32(%rcx)		C ecx = (-n) mod 8 = table entries to skip

	neg	n			C n = -n, a negative index counting up to 0

	mov	R32(%rcx), R32(%rax)
	neg	%rax
	lea	(up,%rax,8),up		C move up back by 8 bytes per skipped entry

	xor	R32(%rax), R32(%rax)	C rax = 0, the running bit count

	lea	(%rcx,%rcx,4), %rcx	C rcx = 5 * skip

	lea	L(top)(%rip), %rdx
	lea	(%rdx,%rcx,2), %rdx	C rdx = L(top) + 10 * skip
	jmp	*%rdx			C enter the unrolled loop at entry number skip
',`
C Alternative setup, never expanded while the condition above stays 1,1.
C It computes the same pointer adjustment with shl/sub and the same table
C offset with a single imul by 10.
	lea	(up,n,8), up

	mov	R32(n), R32(%rcx)
	neg	R32(%rcx)
	and	$8-1, R32(%rcx)

	neg	n

	mov	R32(%rcx), R32(%rax)
	shl	$3, R32(%rax)
	sub	%rax, up

	xor	R32(%rax), R32(%rax)

C	add	R32(%rcx), R32(%rcx)	C 2x
C	lea	(%rcx,%rcx,4), %rcx	C 10x
	imul	$10, R32(%rcx)

	lea	L(top)(%rip), %rdx
	add	%rcx, %rdx
	jmp	*%rdx
')

C Unrolled loop / jump table.  Every entry below must stay exactly 10 bytes
C long, because the computed jump above uses a fixed stride of 10.  The first
C entry therefore carries an explicit zero displacement byte (the final 0x00)
C so that it is the same length as the others.  Entry k loads the limb at
C byte offset 8*k from (up,n,8); results alternate between %r8 and %r9.
C The "x = n mod 8" labels name the residue of n for which the first pass
C starts at that entry.
	ALIGN(32)
L(top):
C 0 = n mod 8
	.byte	0xf3,0x4c,0x0f,0xb8,0x44,0xf7,0x00	C popcnt 0(up,n,8), %r8
	add	%r8, %rax
C 7 = n mod 8
	.byte	0xf3,0x4c,0x0f,0xb8,0x4c,0xf7,0x08	C popcnt 8(up,n,8), %r9
	add	%r9, %rax
C 6 = n mod 8
	.byte	0xf3,0x4c,0x0f,0xb8,0x44,0xf7,0x10	C popcnt 16(up,n,8), %r8
	add	%r8, %rax
C 5 = n mod 8
	.byte	0xf3,0x4c,0x0f,0xb8,0x4c,0xf7,0x18	C popcnt 24(up,n,8), %r9
	add	%r9, %rax
C 4 = n mod 8
	.byte	0xf3,0x4c,0x0f,0xb8,0x44,0xf7,0x20	C popcnt 32(up,n,8), %r8
	add	%r8, %rax
C 3 = n mod 8
	.byte	0xf3,0x4c,0x0f,0xb8,0x4c,0xf7,0x28	C popcnt 40(up,n,8), %r9
	add	%r9, %rax
C 2 = n mod 8
	.byte	0xf3,0x4c,0x0f,0xb8,0x44,0xf7,0x30	C popcnt 48(up,n,8), %r8
	add	%r8, %rax
C 1 = n mod 8
	.byte	0xf3,0x4c,0x0f,0xb8,0x4c,0xf7,0x38	C popcnt 56(up,n,8), %r9
	add	%r9, %rax

	add	$8, n			C advance the negative index by one full pass
	js	L(top)			C loop again while n is still negative
	FUNC_EXIT()
	ret				C result is in %rax
EPILOGUE()