dnl  PowerPC-32/VMX and PowerPC-64/VMX mpn_popcount.

dnl  Copyright 2006 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C		    cycles/limb
C 7400,7410 (G4):	2.75
C 744x,745x (G4+):	2.25
C 970 (G5):		5.3

C STATUS
C  * Works for all sizes and alignments.

C TODO
C  * Tune the awkward huge n outer loop code.
C  * Two lvx, two vperm, and two vxor could make us a similar hamdist.
C  * For the 970, a combined VMX+intop approach might be best.
C  * Compress cnsts table in 64-bit mode, only half the values are needed.

C INPUT PARAMETERS
C up	r3	pointer to the limb vector
C n	r4	number of limbs (NOTE(review): presumably n >= 1 as is usual
C		for mpn entry points - not checked here)
C
C Returns the total bit population count of the n limbs at up, in r3.
C
C Method: classic bit-slice counting on 32 bytes per inner-loop iteration.
C Each 16-byte vector is reduced with the 0x55, 0x33 and 0x0f masks to
C per-byte counts, which vsum4ubs accumulates into four 32-bit fields of v3.
C Misaligned ends are handled by letting lvx round the address down to a
C 16-byte boundary and masking off the out-of-range bytes with entries from
C the cnsts table below.

define(`GMP_LIMB_BYTES', eval(GMP_LIMB_BITS/8))
define(`LIMBS_PER_VR',  eval(16/GMP_LIMB_BYTES))	C limbs per vector register
define(`LIMBS_PER_2VR', eval(32/GMP_LIMB_BYTES))	C limbs per loop iteration

define(`OPERATION_popcount')

ifdef(`OPERATION_popcount',`
  define(`func',`mpn_popcount')
  define(`up',		`r3')
  define(`n',		`r4')
  define(`HAM',		`dnl')
')
ifdef(`OPERATION_hamdist',`
  define(`func',`mpn_hamdist')
  define(`up',		`r3')
  define(`vp',		`r4')
  define(`n',		`r5')
  define(`HAM',		`$1')
')

C Named vector registers; names encode the repeating byte bit pattern.
define(`x01010101',`v2')
define(`x00110011',`v7')
define(`x00001111',`v10')
define(`cnt1',`v11')
define(`cnt2',`v12')
define(`cnt4',`v13')

ifelse(GMP_LIMB_BITS,32,`
	define(`LIMB32',`	$1')
	define(`LIMB64',`')
',`
	define(`LIMB32',`')
	define(`LIMB64',`	$1')
')

C The inner loop handles up to 2^34 bits, i.e., 2^31 64-limbs, due to overflow
C in vsum4ubs.  For large operands, we work in chunks, of size LIMBS_PER_CHUNK.
define(`LIMBS_PER_CHUNK', 0x1000)
define(`LIMBS_CHUNK_THRES', 0x1001)

ASM_START()
PROLOGUE(mpn_popcount)
	mfspr	r10, 256		C save VRSAVE
	oris	r0, r10, 0xfffc		C Set VRSAVE bit 0-13
	mtspr	256, r0

ifdef(`HAVE_ABI_mode32',
`	rldicl	n, n, 0, 32')		C zero extend n

C Load various constants into vector registers
	LEAL(	r11, cnsts)
	li	r12, 16
	vspltisb cnt1, 1		C 0x0101...01 used as shift count
	vspltisb cnt2, 2		C 0x0202...02 used as shift count
	vspltisb cnt4, 4		C 0x0404...04 used as shift count
	lvx	x01010101, 0, r11	C 0x5555...55
	lvx	x00110011, r12, r11	C 0x3333...33
	vspltisb x00001111, 15		C 0x0f0f...0f

LIMB64(`lis	r0, LIMBS_CHUNK_THRES	')	C chunking threshold for huge n
LIMB64(`cmpd	cr7, n, r0		')

C First, possibly partial, vector: lvx rounds the address down to 16 bytes,
C so mask off the bytes below up using the low-end mask table at cnsts+96.
	lvx	v0, 0, up
	addi	r7, r11, 96
	rlwinm	r6, up, 2,26,29		C (up mod 16) * 4 = mask table offset
	lvx	v8, r7, r6
	vand	v0, v0, v8

LIMB32(`rlwinm	r8, up, 30,30,31	')	C whole limbs masked away above
LIMB64(`rlwinm	r8, up, 29,31,31	')
	add	n, n, r8		C compensate n for rounded down `up'

	vxor	v1, v1, v1
	li	r8, 0			C grand total count

	vxor	v3, v3, v3		C zero total count

	addic.	n, n, -LIMBS_PER_VR
	ble	L(sum)			C <= 1 vector in total

	addic.	n, n, -LIMBS_PER_VR
	ble	L(lsum)			C <= 2 vectors in total

C For 64-bit machines, handle huge n that would overflow vsum4ubs
LIMB64(`ble	cr7, L(small)		')
LIMB64(`addis	r9, n, -LIMBS_PER_CHUNK	')	C remaining n
LIMB64(`lis	n, LIMBS_PER_CHUNK	')
L(small):

LIMB32(`srwi	r7, n, 3		')	C loop count corresponding to n
LIMB64(`srdi	r7, n, 2		')	C loop count corresponding to n
	addi	r7, r7, 1
	mtctr	r7			C copy n to count register
	b	L(ent)			C enter loop with v0 already loaded

	ALIGN(8)
L(top):	lvx	v0, 0, up
	li	r7, 128			C prefetch distance
L(ent):	lvx	v1, r12, up
	addi	up, up, 32
	vsr	v4, v0, cnt1
	vsr	v5, v1, cnt1
	dcbt	up, r7			C prefetch
	vand	v8, v4, x01010101
	vand	v9, v5, x01010101
	vsububm	v0, v0, v8		C 64 2-bit accumulators (0..2)
	vsububm	v1, v1, v9		C 64 2-bit accumulators (0..2)
	vsr	v4, v0, cnt2
	vsr	v5, v1, cnt2
	vand	v8, v0, x00110011
	vand	v9, v1, x00110011
	vand	v4, v4, x00110011
	vand	v5, v5, x00110011
	vaddubm	v0, v4, v8		C 32 4-bit accumulators (0..4)
	vaddubm	v1, v5, v9		C 32 4-bit accumulators (0..4)
	vaddubm	v8, v0, v1		C 32 4-bit accumulators (0..8)
	vsr	v9, v8, cnt4
	vand	v6, v8, x00001111
	vand	v9, v9, x00001111
	vaddubm	v6, v9, v6		C 16 8-bit accumulators (0..16)
	vsum4ubs v3, v6, v3		C sum 4 x 4 bytes into 4 32-bit fields
	bdnz	L(top)

C Tail: 0, 1, or 2 vectors remain; mask the high end of the last vector
C using the high-end mask table at cnsts+32.
	andi.	n, n, eval(LIMBS_PER_2VR-1)
	beq	L(rt)

	lvx	v0, 0, up
	vxor	v1, v1, v1
	cmpwi	n, LIMBS_PER_VR
	ble	L(sum)
L(lsum):
	vor	v1, v0, v0		C move first vector to v1
	lvx	v0, r12, up
L(sum):
LIMB32(`rlwinm	r6, n, 4,26,27	')	C (n mod LIMBS_PER_VR) * 16
LIMB64(`rlwinm	r6, n, 5,26,26	')
	addi	r7, r11, 32
	lvx	v8, r7, r6
	vand	v0, v0, v8

C Same bit-slice reduction as the main loop, for the final one/two vectors.
	vsr	v4, v0, cnt1
	vsr	v5, v1, cnt1
	vand	v8, v4, x01010101
	vand	v9, v5, x01010101
	vsububm	v0, v0, v8		C 64 2-bit accumulators (0..2)
	vsububm	v1, v1, v9		C 64 2-bit accumulators (0..2)
	vsr	v4, v0, cnt2
	vsr	v5, v1, cnt2
	vand	v8, v0, x00110011
	vand	v9, v1, x00110011
	vand	v4, v4, x00110011
	vand	v5, v5, x00110011
	vaddubm	v0, v4, v8		C 32 4-bit accumulators (0..4)
	vaddubm	v1, v5, v9		C 32 4-bit accumulators (0..4)
	vaddubm	v8, v0, v1		C 32 4-bit accumulators (0..8)
	vsr	v9, v8, cnt4
	vand	v6, v8, x00001111
	vand	v9, v9, x00001111
	vaddubm	v6, v9, v6		C 16 8-bit accumulators (0..16)
	vsum4ubs v3, v6, v3		C sum 4 x 4 bytes into 4 32-bit fields

C Spill v3 below sp and sum its four 32-bit fields into r8.
L(rt):
	li	r7, -16			C FIXME: does all ppc32 and ppc64 ABIs
	stvx	v3, r7, r1		C FIXME: ...support storing below sp?

	lwz	r7, -16(r1)
	add	r8, r8, r7
	lwz	r7, -12(r1)
	add	r8, r8, r7
	lwz	r7, -8(r1)
	add	r8, r8, r7
	lwz	r7, -4(r1)
	add	r8, r8, r7

C Handle outer loop for huge n.  We inherit cr7 and r0 from above.
LIMB64(`ble	cr7, L(ret)
	vxor	v3, v3, v3		C zero total count
	mr	n, r9
	cmpd	cr7, n, r0
	ble	cr7, L(2)
	addis	r9, n, -LIMBS_PER_CHUNK	C remaining n
	lis	n, LIMBS_PER_CHUNK
L(2):	srdi	r7, n, 2		C loop count corresponding to n
	mtctr	r7			C copy n to count register
	b	L(top)
')

L(ret):	mr	r3, r8			C return grand total
	mtspr	256, r10		C restore VRSAVE
	blr
EPILOGUE()

DEF_OBJECT(cnsts,16)
C Bit-slice masks, loaded at offsets 0 and 16 from r11
	.byte	0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55
	.byte	0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55

	.byte	0x33,0x33,0x33,0x33,0x33,0x33,0x33,0x33
	.byte	0x33,0x33,0x33,0x33,0x33,0x33,0x33,0x33
C Masks for high end of number
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff

	.byte	0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00

	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00

	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
	.byte	0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
C Masks for low end of number
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff

	.byte	0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff

	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff

	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
	.byte	0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff
END_OBJECT(cnsts)
ASM_END()