dnl  ARM64 Neon mpn_hamdist -- mpn bit hamming distance.

dnl  Copyright 2013, 2014 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb
C Cortex-A53	 ?
C Cortex-A57	 ?

C TODO
C  * Consider greater unrolling.
C  * Arrange to align the pointer, if that helps performance.  Use the same
C    read-and-mask trick we use on PCs, for simplicity and performance.  (Sorry
C    valgrind!)
C  * Explore if explicit align directives, e.g., "[ptr:128]" help.
C  * See rth's gmp-devel 2013-02/03 messages about final summation tricks.

changecom(@&*$)

C INPUT PARAMETERS
define(`ap', x0)
define(`bp', x1)
define(`n',  x2)

C REGISTER USAGE
C   x0      ap on entry, bit count on return
C   x1      bp on entry, overwritten during the final summation
C   x2      n, number of limbs still to be fetched
C   x4      running total over chunks (large counts only)
C   x5,x6   ap2,bp2: start of the next chunk (large counts only)
C   x7      limbs remaining after the current chunk (large counts only)
C   x8      copy of the return address x30 (large counts only)
C   x9,x10  chunk size in bytes and in limbs (large counts only)
C   x11     maxsize
C   v0-v3   limbs from ap, then ap xor bp
C   v16-v19 limbs from bp
C   v6,v7   per-byte bit counts
C   v4,v5   8 + 8 16-bit accumulators
C No stack is used and no callee-saved register is written.

C We sum into 16 16-bit counters in v4,v5, but at the end we sum them and end
C up with 8 16-bit counters.  Every uadalp adds at most 16 to a counter (two
C bytes of at most 8 set bits each), so a counter can absorb the contribution
C of at most 4095 loads (4095*16 = 65520 <= 2^16-1).  A full vector load covers
C 2 limbs, but the single-limb load used for odd n also adds up to 16 to each
C of counters 0-3.  An odd count of 0x1fff limbs would mean 1 + 4095 loads and
C up to 65536 in counters 0-3, which wraps to 0.  The largest safe count is
C therefore 0x1ffe limbs (at most 4095 loads, odd or even).  We use a chunksize
C close to that, a multiple of 8, which allows the huge count code to jump
C deep into the code (at L(chu)).

define(`maxsize',  0x1ffe)
define(`chunksize',0x1ff0)

ASM_START()
PROLOGUE(mpn_hamdist)

	mov	x11, #maxsize
	cmp	n, x11
	b.hi	L(gt8k)				C too many limbs for 16-bit counters

C Entry for counts <= maxsize.  Also called with bl for the last piece of a
C large operand.
L(lt8k):
	movi	v4.16b, #0			C clear summation register
	movi	v5.16b, #0			C clear summation register

C Peel off n mod 8 limbs (1, then 2, then 4) so the main loop sees a
C multiple of 8.
	tbz	n, #0, L(xx0)
	sub	n, n, #1
	ld1	{v0.1d}, [ap], #8		C load 1 limb
	ld1	{v16.1d}, [bp], #8		C load 1 limb
	eor	v0.16b, v0.16b, v16.16b
	cnt	v6.16b, v0.16b
	uadalp	v4.8h,  v6.16b			C could also splat

L(xx0):	tbz	n, #1, L(x00)
	sub	n, n, #2
	ld1	{v0.2d}, [ap], #16		C load 2 limbs
	ld1	{v16.2d}, [bp], #16		C load 2 limbs
	eor	v0.16b, v0.16b, v16.16b
	cnt	v6.16b, v0.16b
	uadalp	v4.8h,  v6.16b

L(x00):	tbz	n, #2, L(000)
	subs	n, n, #4
	ld1	{v0.2d,v1.2d}, [ap], #32	C load 4 limbs
	ld1	{v16.2d,v17.2d}, [bp], #32	C load 4 limbs
	b.ls	L(sum)				C nothing left after these 4 limbs

C At least 8 more limbs remain.  Fetch 4 of them and join the loop halfway.
L(gt4):	ld1	{v2.2d,v3.2d}, [ap], #32	C load 4 limbs
	ld1	{v18.2d,v19.2d}, [bp], #32	C load 4 limbs
	eor	v0.16b, v0.16b, v16.16b
	eor	v1.16b, v1.16b, v17.16b
	sub	n, n, #4
	cnt	v6.16b, v0.16b
	cnt	v7.16b, v1.16b
	b	L(mid)

L(000):	subs	n, n, #8
	b.lo	L(e0)				C no full group of 8; only v4 holds counts

C Entry for the large count code, which arrives here with n = chunksize-8 and
C cleared v4,v5.
L(chu):	ld1	{v2.2d,v3.2d}, [ap], #32	C load 4 limbs
	ld1	{v0.2d,v1.2d}, [ap], #32	C load 4 limbs
	ld1	{v18.2d,v19.2d}, [bp], #32	C load 4 limbs
	ld1	{v16.2d,v17.2d}, [bp], #32	C load 4 limbs
	eor	v2.16b, v2.16b, v18.16b
	eor	v3.16b, v3.16b, v19.16b
	cnt	v6.16b, v2.16b
	cnt	v7.16b, v3.16b
	subs	n, n, #8
	b.lo	L(end)

C Main loop, 8 limbs per iteration.  Loads for the next half are issued while
C the counts of the previous half are accumulated.
L(top):	ld1	{v2.2d,v3.2d}, [ap], #32	C load 4 limbs
	ld1	{v18.2d,v19.2d}, [bp], #32	C load 4 limbs
	eor	v0.16b, v0.16b, v16.16b
	eor	v1.16b, v1.16b, v17.16b
	uadalp	v4.8h,  v6.16b
	cnt	v6.16b, v0.16b
	uadalp	v5.8h,  v7.16b
	cnt	v7.16b, v1.16b
L(mid):	ld1	{v0.2d,v1.2d}, [ap], #32	C load 4 limbs
	ld1	{v16.2d,v17.2d}, [bp], #32	C load 4 limbs
	eor	v2.16b, v2.16b, v18.16b
	eor	v3.16b, v3.16b, v19.16b
	subs	n, n, #8
	uadalp	v4.8h,  v6.16b
	cnt	v6.16b, v2.16b
	uadalp	v5.8h,  v7.16b
	cnt	v7.16b, v3.16b
	b.hs	L(top)

C Wind down: accumulate the pending counts, then handle the 4 limbs that are
C loaded in v0,v1/v16,v17 but not yet xored.
L(end):	uadalp	v4.8h,  v6.16b
	uadalp	v5.8h,  v7.16b
L(sum):	eor	v0.16b, v0.16b, v16.16b
	eor	v1.16b, v1.16b, v17.16b
	cnt	v6.16b, v0.16b
	cnt	v7.16b, v1.16b
	uadalp	v4.8h,  v6.16b
	uadalp	v5.8h,  v7.16b
	add	v4.8h, v4.8h, v5.8h		C cannot wrap for counts <= maxsize
					C we have 8 16-bit counts
L(e0):	uaddlp	v4.4s,  v4.8h			C we have 4 32-bit counts
	uaddlp	v4.2d,  v4.4s			C we have 2 64-bit counts
	mov	x0, v4.d[0]
	mov	x1, v4.d[1]
	add	x0, x0, x1
	ret

C Code for count > maxsize.  Splits operand and calls above code.
C Each chunk of chunksize limbs is summed with its own cleared 16-bit
C counters and the per-chunk results are added up in x4.  The code above
C overwrites x0 and x1, so the pointers for the next chunk are computed into
C ap2,bp2 before each call.
define(`ap2', x5)			C caller-saves reg not used above
define(`bp2', x6)			C caller-saves reg not used above
L(gt8k):
	mov	x8, x30			C bl below overwrites the return address
	mov	x7, n			C full count (caller-saves reg not used above)
	mov	x4, #0			C total sum  (caller-saves reg not used above)
	mov	x9, #chunksize*8	C caller-saves reg not used above
	mov	x10, #chunksize		C caller-saves reg not used above

1:	add	ap2, ap, x9		C point at subsequent block
	add	bp2, bp, x9		C point at subsequent block
	mov	n, #chunksize-8		C count for this invocation, adjusted for entry pt
	movi	v4.16b, #0		C clear chunk summation register
	movi	v5.16b, #0		C clear chunk summation register
	bl	L(chu)			C jump deep inside code
	add	x4, x4, x0
	mov	ap, ap2			C put chunk pointer in place for calls
	mov	bp, bp2			C put chunk pointer in place for calls
	sub	x7, x7, x10
	cmp	x7, x11
	b.hi	1b			C more than maxsize limbs still remain

	mov	n, x7			C count for final invocation
	bl	L(lt8k)
	add	x0, x4, x0
	mov	x30, x8			C restore the return address
	ret
EPILOGUE()