dnl  ARM64 Neon mpn_popcount -- mpn bit population count.

dnl  Copyright 2013, 2014 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb
C Cortex-A53	 ?
C Cortex-A57	 ?

C TODO
C  * Consider greater unrolling.
C  * Arrange to align the pointer, if that helps performance.  Use the same
C    read-and-mask trick we use on PCs, for simplicity and performance.  (Sorry
C    valgrind!)
C  * Explore if explicit align directives, e.g., "[ptr:128]" help.
C  * See rth's gmp-devel 2013-02/03 messages about final summation tricks.

C INTERFACE
C   mp_bitcnt_t mpn_popcount (mp_srcptr ap, mp_size_t n)
C   In:   x0 = ap, pointer to the least significant limb
C         x1 = n, number of 64-bit limbs
C   Out:  x0 = number of set bits in {ap, n}
C   Clobbers: x1, x4, x5, x7-x11, v0-v7 and the condition flags.  x30 is
C   overwritten by the internal bl instructions in the n > maxsize path, but
C   it is saved in x8 and restored before returning.  No stack is used.

changecom(@&*$)

C INPUT PARAMETERS
define(`ap', x0)
define(`n',  x1)

C We sum into 16 16-bit counters in v4,v5, but at the end we sum them and end
C up with 8 16-bit counters.  Every 128-bit load adds at most 16 to each of
C these 8 final counters, and the single-limb load (odd n) adds at most 16 to
C the 4 low counters only, since ld1 {v0.1d} leaves the upper half of v0 zero.
C A low counter can therefore reach 16*ceil(n/2), which must stay below 2^16,
C i.e., n <= 0x1ffe.  (With n = 0x1fff and all bits set, the low counters
C would reach exactly 2^16 and wrap to 0 in the final 16-bit add.)  We use a
C chunksize close to that limit, but which allows the huge count code to jump
C deep into the code (at L(chu)); it needs to be a multiple of 8 for that.

define(`maxsize',  0x1ffe)
define(`chunksize',0x1ff0)

ASM_START()
PROLOGUE(mpn_popcount)

	mov	x11, #maxsize
	cmp	n, x11
	b.hi	L(gt8k)				C too many limbs for 16-bit counters

C Code for count <= maxsize.  Also called (with bl) from the code at L(gt8k)
C for the final, partial chunk.  Peels off 1, 2, and 4 limbs according to the
C low 3 bits of n, then handles 8 limbs per iteration.
L(lt8k):
	movi	v4.16b, #0			C clear summation register
	movi	v5.16b, #0			C clear summation register

	tbz	n, #0, L(xx0)			C skip if n is even
	sub	n, n, #1
	ld1	{v0.1d}, [ap], #8		C load 1 limb
	cnt	v6.16b, v0.16b
	uadalp	v4.8h,  v6.16b			C could also splat

L(xx0):	tbz	n, #1, L(x00)			C skip if bit 1 of n is clear
	sub	n, n, #2
	ld1	{v0.2d}, [ap], #16		C load 2 limbs
	cnt	v6.16b, v0.16b
	uadalp	v4.8h,  v6.16b

L(x00):	tbz	n, #2, L(000)			C skip if bit 2 of n is clear
	subs	n, n, #4
	ld1	{v0.2d,v1.2d}, [ap], #32	C load 4 limbs
	b.ls	L(sum)				C done if those were the last limbs

C At least 8 more limbs remain.  Load 4 of them and enter the loop in its
C middle, with the counts for the first 4 limbs pending in v6,v7.
L(gt4):	ld1	{v2.2d,v3.2d}, [ap], #32	C load 4 limbs
	sub	n, n, #4
	cnt	v6.16b, v0.16b
	cnt	v7.16b, v1.16b
	b	L(mid)

L(000):	subs	n, n, #8
	b.lo	L(e0)				C no limbs left, just sum v4

C Entry point also used by the code at L(gt8k).  Here n holds the number of
C limbs to process minus 8, that number is a multiple of 8, and v4,v5 hold
C the counters accumulated so far.
L(chu):	ld1	{v2.2d,v3.2d}, [ap], #32	C load 4 limbs
	ld1	{v0.2d,v1.2d}, [ap], #32	C load 4 limbs
	cnt	v6.16b, v2.16b
	cnt	v7.16b, v3.16b
	subs	n, n, #8
	b.lo	L(end)

C Main loop, 8 limbs per iteration.  The byte counts of previously loaded
C limbs are accumulated while the next limbs are being loaded.
L(top):	ld1	{v2.2d,v3.2d}, [ap], #32	C load 4 limbs
	uadalp	v4.8h,  v6.16b
	cnt	v6.16b, v0.16b
	uadalp	v5.8h,  v7.16b
	cnt	v7.16b, v1.16b
L(mid):	ld1	{v0.2d,v1.2d}, [ap], #32	C load 4 limbs
	subs	n, n, #8
	uadalp	v4.8h,  v6.16b
	cnt	v6.16b, v2.16b
	uadalp	v5.8h,  v7.16b
	cnt	v7.16b, v3.16b
	b.hs	L(top)

L(end):	uadalp	v4.8h,  v6.16b			C add pending counts
	uadalp	v5.8h,  v7.16b
L(sum):	cnt	v6.16b, v0.16b			C count the last 4 loaded limbs
	cnt	v7.16b, v1.16b
	uadalp	v4.8h,  v6.16b
	uadalp	v5.8h,  v7.16b
	add	v4.8h, v4.8h, v5.8h
					C we have 8 16-bit counts
L(e0):	uaddlp	v4.4s,  v4.8h			C we have 4 32-bit counts
	uaddlp	v4.2d,  v4.4s			C we have 2 64-bit counts
	mov	x0, v4.d[0]
	mov	x1, v4.d[1]
	add	x0, x0, x1			C result (n in x1 is dead by now)
	ret

C Code for count > maxsize.  Splits operand and calls above code.
define(`ap2', x5)			C caller-saves reg not used above
L(gt8k):
	mov	x8, x30				C save return address, bl clobbers x30
	mov	x7, n				C full count (caller-saves reg not used above)
	mov	x4, #0				C total sum  (caller-saves reg not used above)
	mov	x9, #chunksize*8		C caller-saves reg not used above
	mov	x10, #chunksize			C caller-saves reg not used above

1:	add	ap2, ap, x9			C point at subsequent block
	mov	n, #chunksize-8			C count for this invocation, adjusted for entry pt
	movi	v4.16b, #0			C clear chunk summation register
	movi	v5.16b, #0			C clear chunk summation register
	bl	L(chu)				C jump deep inside code
	add	x4, x4, x0
	mov	ap, ap2				C put chunk pointer in place for calls
	sub	x7, x7, x10
	cmp	x7, x11				C x11 still holds maxsize
	b.hi	1b

	mov	n, x7				C count for final invocation
	bl	L(lt8k)
	add	x0, x4, x0
	mov	x30, x8				C restore return address
	ret
EPILOGUE()