dnl  X86-32 and X86-64 mpn_popcount using SSE2.

dnl  Copyright 2006, 2007, 2011, 2015, 2020 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.


include(`../config.m4')


C 32-bit                          popcount        hamdist
C                                cycles/limb    cycles/limb
C P5                                  -
C P6 model 0-8,10-12                  -
C P6 model 9  (Banias)                ?
C P6 model 13 (Dothan)                4
C P4 model 0  (Willamette)            ?
C P4 model 1  (?)                     ?
C P4 model 2  (Northwood)             3.9
C P4 model 3  (Prescott)              ?
C P4 model 4  (Nocona)                ?
C AMD K6                              -
C AMD K7                              -
C AMD K8                              ?

C 64-bit                          popcount        hamdist
C                                cycles/limb    cycles/limb
C P4 model 4 (Nocona)                 8
C AMD K8,K9                           7.5
C AMD K10                             3.5
C Intel core2                         3.68
C Intel corei                         3.15
C Intel atom                         10.8
C VIA nano                            6.5

C TODO
C  * Make an mpn_hamdist based on this.  Alignment could either be handled by
C    using movdqu for one operand and movdqa for the other, or by painfully
C    shifting as we go.  Unfortunately, there seems to be no usable shift
C    instruction, except for one that takes an immediate count.
C  * It would probably be possible to cut a few cycles/limb using software
C    pipelining.
C  * There are 35 decode slots unused by the SSE2 instructions.  Loop control
C    needs just 2 or 3 slots, leaving around 32 slots.  This allows a parallel
C    integer-based popcount.  Such a combined loop would handle 6 limbs in
C    about 30 cycles on K8.
C  * We could save a byte or two by using 32-bit operations on areg.
C  * Check if using movdqa to a temp register and then a register-based pand
C    is faster.
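
C The loop below counts bits with the usual SSE2 bit-slicing reduction.  As a
C rough C-like sketch (illustrative only, not part of the build), each 128-bit
C word x is reduced as
C
C   x = x - ((x >> 1) & 0x55...55)                  C 2-bit field sums
C   x = (x & 0x33...33) + ((x >> 2) & 0x33...33)    C 4-bit field sums
C   x = (x & 0x0f...0f) + ((x >> 4) & 0x0f...0f)    C byte sums
C   total += psadbw(x, 0)                           C add up the 16 byte sums
C
C Two 128-bit words are processed per iteration; their 4-bit field sums are
C added together before the byte and psadbw steps.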

ifelse(GMP_LIMB_BITS,`32',
`       define(`up',  `%edx')
        define(`n',   `%ecx')
        define(`areg',`%eax')
        define(`breg',`%ebx')
        define(`zero',`%xmm4')
        define(`LIMB32',`       $1')
        define(`LIMB64',`dnl')
',`
        define(`up',  `%rdi')
        define(`n',   `%rsi')
        define(`areg',`%rax')
        define(`breg',`%rdx')
        define(`zero',`%xmm8')
        define(`LIMB32',`dnl')
        define(`LIMB64',`       $1')
')

define(`mm01010101',`%xmm6')
define(`mm00110011',`%xmm7')
define(`mm00001111',`%xmm2')

define(`GMP_LIMB_BYTES', eval(GMP_LIMB_BITS/8))
define(`LIMBS_PER_XMM',  eval(16/GMP_LIMB_BYTES))
define(`LIMBS_PER_2XMM', eval(32/GMP_LIMB_BYTES))

undefine(`psadbw')                      C override inherited m4 version

C This file is shared between 32-bit and 64-bit builds.  Only the former has
C LEAL.  Default LEAL as an alias of LEA.
ifdef(`LEAL',,`define(`LEAL', `LEA($1,$2)')')

ASM_START()

C Make cnsts global to work around Apple relocation bug.
ifdef(`DARWIN',`
        define(`cnsts', MPN(popccnsts))
        GLOBL   cnsts')

        TEXT
        ALIGN(32)
PROLOGUE(mpn_popcount)

LIMB32(`mov     4(%esp), up     ')
LIMB32(`mov     8(%esp), n      ')
LIMB32(`push    %ebx            ')

        pxor    %xmm3, %xmm3            C zero grand total count
LIMB64(`pxor    zero, zero      ')

        LEAL(   cnsts, breg)

        movdqa  -48(breg), mm01010101
        movdqa  -32(breg), mm00110011
        movdqa  -16(breg), mm00001111

        mov     up, areg
        and     $-16, up                C round `up' down to 128-bit boundary
        and     $12, areg               C 32:areg = 0, 4, 8, 12
                                        C 64:areg = 0, 8
        movdqa  (up), %xmm0
        pand    64(breg,areg,4), %xmm0
        shr     $m4_log2(GMP_LIMB_BYTES), %eax
        add     areg, n                 C compensate n for rounded down `up'

        pxor    %xmm4, %xmm4
        sub     $LIMBS_PER_XMM, n
        jbe     L(sum)

        sub     $LIMBS_PER_XMM, n
        ja      L(ent)
        jmp     L(lsum)

        ALIGN(16)
L(top): movdqa  (up), %xmm0
L(ent): movdqa  16(up), %xmm4

        movdqa  %xmm0, %xmm1
        movdqa  %xmm4, %xmm5
        psrld   $1, %xmm0
        psrld   $1, %xmm4
        pand    mm01010101, %xmm0
        pand    mm01010101, %xmm4
        psubd   %xmm0, %xmm1
        psubd   %xmm4, %xmm5

        movdqa  %xmm1, %xmm0
        movdqa  %xmm5, %xmm4
        psrlq   $2, %xmm1
        psrlq   $2, %xmm5
        pand    mm00110011, %xmm0
        pand    mm00110011, %xmm4
        pand    mm00110011, %xmm1
        pand    mm00110011, %xmm5
        paddq   %xmm0, %xmm1
        paddq   %xmm4, %xmm5

LIMB32(`pxor    zero, zero      ')

        add     $32, up
        sub     $LIMBS_PER_2XMM, n

        paddq   %xmm5, %xmm1
        movdqa  %xmm1, %xmm0
        psrlq   $4, %xmm1
        pand    mm00001111, %xmm0
        pand    mm00001111, %xmm1
        paddq   %xmm0, %xmm1

        psadbw  zero, %xmm1
        paddq   %xmm1, %xmm3            C add to grand total

        jnc     L(top)
L(end):
        add     $LIMBS_PER_2XMM, n
        jz      L(rt)
        movdqa  (up), %xmm0
        pxor    %xmm4, %xmm4
        sub     $LIMBS_PER_XMM, n
        jbe     L(sum)
L(lsum):
        movdqa  %xmm0, %xmm4
        movdqa  16(up), %xmm0
L(sum):
        shl     $m4_log2(GMP_LIMB_BYTES), n
        and     $12, n
        pand    (breg,n,4), %xmm0

        movdqa  %xmm0, %xmm1
        movdqa  %xmm4, %xmm5
        psrld   $1, %xmm0
        psrld   $1, %xmm4
        pand    mm01010101, %xmm0
        pand    mm01010101, %xmm4
        psubd   %xmm0, %xmm1
        psubd   %xmm4, %xmm5

        movdqa  %xmm1, %xmm0
        movdqa  %xmm5, %xmm4
        psrlq   $2, %xmm1
        psrlq   $2, %xmm5
        pand    mm00110011, %xmm0
        pand    mm00110011, %xmm4
        pand    mm00110011, %xmm1
        pand    mm00110011, %xmm5
        paddq   %xmm0, %xmm1
        paddq   %xmm4, %xmm5

LIMB32(`pxor    zero, zero      ')

        paddq   %xmm5, %xmm1
        movdqa  %xmm1, %xmm0
        psrlq   $4, %xmm1
        pand    mm00001111, %xmm0
        pand    mm00001111, %xmm1
        paddq   %xmm0, %xmm1

        psadbw  zero, %xmm1
        paddq   %xmm1, %xmm3            C add to grand total


C Add the two 64-bit halves of the grand total counter
L(rt):  movdqa  %xmm3, %xmm0
        psrldq  $8, %xmm3
        paddq   %xmm3, %xmm0
        movd    %xmm0, areg             C movq avoided due to gas bug

LIMB32(`pop     %ebx            ')
        ret

EPILOGUE()
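
C The constant table below is laid out so that the three bit-slicing masks sit
C just below the cnsts label (at offsets -48, -32 and -16), while the
C end-of-operand masks sit at and above it.  As the addressing above suggests,
C the initial pand with 64(breg,areg,4) selects a "low end" mask that clears
C the bytes below the true start of a misaligned operand, and the pand at
C L(sum) with (breg,n,4) selects a "high end" mask that clears the bytes
C beyond its end.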
DEF_OBJECT(dummy,16)
C Three magic constants used for masking out bits
        .byte   0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55
        .byte   0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55

        .byte   0x33,0x33,0x33,0x33,0x33,0x33,0x33,0x33
        .byte   0x33,0x33,0x33,0x33,0x33,0x33,0x33,0x33

        .byte   0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f
        .byte   0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f
cnsts:
C Masks for high end of number
        .byte   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
        .byte   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff

        .byte   0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
        .byte   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00

        .byte   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
        .byte   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00

        .byte   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
        .byte   0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
C Masks for low end of number
        .byte   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
        .byte   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff

        .byte   0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff
        .byte   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff

        .byte   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
        .byte   0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff

        .byte   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
        .byte   0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff
END_OBJECT(dummy)
ASM_END()