dnl  PowerPC-32/VMX and PowerPC-64/VMX mpn_popcount.

dnl  Copyright 2006, 2010 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C                   cycles/limb
C 7400,7410 (G4):        ?
C 744x,745x (G4+):       1.125
C 970 (G5):              2.25

C TODO
C  * Rewrite the awkward huge n outer loop code.
C  * Two lvx, two vperm, and two vxor could make us a similar hamdist.
C  * Compress cnsts table in 64-bit mode, only half the values are needed.

define(`GMP_LIMB_BYTES', eval(GMP_LIMB_BITS/8))
define(`LIMBS_PER_VR',  eval(16/GMP_LIMB_BYTES))
define(`LIMBS_PER_2VR', eval(32/GMP_LIMB_BYTES))

define(`OPERATION_popcount')

define(`ap', `r3')
define(`n',  `r4')

define(`rtab', `v10')
define(`cnt4', `v11')

ifelse(GMP_LIMB_BITS,32,`
	define(`LIMB32',`	$1')
	define(`LIMB64',`')
',`
	define(`LIMB32',`')
	define(`LIMB64',`	$1')
')

C The inner loop handles up to 2^34 bits, i.e., 2^31 bytes, due to overflow
C in vsum4ubs.  For large operands, we work in chunks, of size LIMBS_PER_CHUNK.
define(`LIMBS_PER_CHUNK',   0x1000)
define(`LIMBS_CHUNK_THRES', 0x1001)

ASM_START()
PROLOGUE(mpn_popcount)
	mfspr	r10, 256
	oris	r0, r10, 0xfffc		C set VRSAVE bits 0-13
	mtspr	256, r0

ifdef(`HAVE_ABI_mode32',
`	rldicl	n, n, 0, 32')		C zero extend n

C Load various constants into vector registers
	LEAL(	r11, cnsts)
	li	r12, 16
	vspltisb cnt4, 4		C 0x0404...04 used as shift count

	li	r7, 160
	lvx	rtab, 0, r11

LIMB64(`lis	r0, LIMBS_CHUNK_THRES	')
LIMB64(`cmpd	cr7, n, r0		')

	lvx	v0, 0, ap
	addi	r7, r11, 80
	rlwinm	r6, ap, 2,26,29
	lvx	v8, r7, r6
	vand	v0, v0, v8

LIMB32(`rlwinm	r8, ap, 30,30,31	')
LIMB64(`rlwinm	r8, ap, 29,31,31	')
	add	n, n, r8		C compensate n for rounded down `ap'

	vxor	v1, v1, v1
	li	r8, 0			C grand total count

	vxor	v12, v12, v12		C zero total count
	vxor	v13, v13, v13		C zero total count

	addic.	n, n, -LIMBS_PER_VR
	ble	L(sum)

	addic.	n, n, -LIMBS_PER_VR
	ble	L(lsum)
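
C The main loop below processes two 16-byte vectors per iteration: vsrb
C splits each byte into nibbles, vperm looks each nibble up in the per-nibble
C count table rtab, vaddubm adds the two lookups into per-byte counts, and
C vsum4ubs accumulates those bytes into the word sums in v12 and v13.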

C For 64-bit machines, handle huge n that would overflow vsum4ubs
LIMB64(`ble	cr7, L(small)		')
LIMB64(`addis	r9, n, -LIMBS_PER_CHUNK	')	C remaining n
LIMB64(`lis	n, LIMBS_PER_CHUNK	')

	ALIGN(16)
L(small):
LIMB32(`srwi	r7, n, 3	')	C loop count corresponding to n
LIMB64(`srdi	r7, n, 2	')	C loop count corresponding to n
	addi	r7, r7, 1
	mtctr	r7			C copy n to count register
	b	L(ent)

	ALIGN(16)
L(top):
	lvx	v0, 0, ap
L(ent):	lvx	v1, r12, ap
	addi	ap, ap, 32
	vsrb	v8, v0, cnt4
	vsrb	v9, v1, cnt4
	vperm	v2, rtab, rtab, v0
	vperm	v3, rtab, rtab, v8
	vperm	v4, rtab, rtab, v1
	vperm	v5, rtab, rtab, v9
	vaddubm	v6, v2, v3
	vaddubm	v7, v4, v5
	vsum4ubs v12, v6, v12
	vsum4ubs v13, v7, v13
	bdnz	L(top)

	andi.	n, n, eval(LIMBS_PER_2VR-1)
	beq	L(rt)

	lvx	v0, 0, ap
	vxor	v1, v1, v1
	cmpwi	n, LIMBS_PER_VR
	ble	L(sum)
L(lsum):
	vor	v1, v0, v0
	lvx	v0, r12, ap
L(sum):
LIMB32(`rlwinm	r6, n, 4,26,27	')
LIMB64(`rlwinm	r6, n, 5,26,26	')
	addi	r7, r11, 16
	lvx	v8, r7, r6
	vand	v0, v0, v8
	vsrb	v8, v0, cnt4
	vsrb	v9, v1, cnt4
	vperm	v2, rtab, rtab, v0
	vperm	v3, rtab, rtab, v8
	vperm	v4, rtab, rtab, v1
	vperm	v5, rtab, rtab, v9
	vaddubm	v6, v2, v3
	vaddubm	v7, v4, v5
	vsum4ubs v12, v6, v12
	vsum4ubs v13, v7, v13

	ALIGN(16)
L(rt):	vadduwm	v3, v12, v13
	li	r7, -16			C FIXME: do all ppc32 and ppc64 ABIs
	stvx	v3, r7, r1		C FIXME: ...support storing below sp?

	lwz	r7, -16(r1)
	add	r8, r8, r7
	lwz	r7, -12(r1)
	add	r8, r8, r7
	lwz	r7, -8(r1)
	add	r8, r8, r7
	lwz	r7, -4(r1)
	add	r8, r8, r7

C Handle outer loop for huge n.  We inherit cr7 and r0 from above.
LIMB64(`ble	cr7, L(ret)
	vxor	v12, v12, v12		C zero total count
	vxor	v13, v13, v13		C zero total count
	mr	n, r9
	cmpd	cr7, n, r0
	ble	cr7, L(2)
	addis	r9, n, -LIMBS_PER_CHUNK	C remaining n
	lis	n, LIMBS_PER_CHUNK
L(2):	srdi	r7, n, 2		C loop count corresponding to n
	mtctr	r7			C copy n to count register
	b	L(top)
')

	ALIGN(16)
L(ret):	mr	r3, r8
	mtspr	256, r10
	blr
EPILOGUE()

DEF_OBJECT(cnsts,16)
C Counts for vperm
	.byte	0x00,0x01,0x01,0x02,0x01,0x02,0x02,0x03
	.byte	0x01,0x02,0x02,0x03,0x02,0x03,0x03,0x04
C Masks for high end of number
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff

	.byte	0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00

	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00

	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
	.byte	0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
C Masks for low end of number
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff

	.byte	0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff

	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff

	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
	.byte	0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff
END_OBJECT(cnsts)
ASM_END()
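
dnl  Illustration only, not part of the build: a C sketch of the nibble-table
dnl  technique the loop above implements with vsrb/vperm (names hypothetical).
dnl
dnl    static const unsigned char tab[16] =	/* popcount of each nibble */
dnl      {0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4};
dnl    unsigned byte_popcount (unsigned char b)
dnl    {
dnl      return tab[b & 0xf] + tab[b >> 4];	/* low nibble + high nibble */
dnl    }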