dnl  ARM64 Neon mpn_hamdist -- mpn bit hamming distance.

dnl  Copyright 2013, 2014 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb
C Cortex-A53	 4.5
C Cortex-A57	 1.9
C X-Gene	 4.36

C TODO
C  * Consider greater unrolling.
C  * Arrange to align the pointer, if that helps performance.  Use the same
C    read-and-mask trick we use on PCs, for simplicity and performance.  (Sorry
C    valgrind!)
C  * Explore if explicit align directives, e.g., "[ptr:128]" help.
C  * See rth's gmp-devel 2013-02/03 messages about final summation tricks.

changecom(blah)

C INPUT PARAMETERS
define(`ap', x0)
define(`bp', x1)
define(`n',  x2)

C We sum into 16 16-bit counters in v4,v5, but at the end we sum them and end
C up with 8 16-bit counters.  Every full 128-bit vector (2 limbs) adds at most
C 16 to each of the 8 final counters, but an odd leading limb is loaded into
C the low half of a vector only and thus adds up to 16 to just the low 4
C counters.  A single pass over 0x1fff limbs is 4095 vectors plus one odd limb,
C so the low counters could reach 4095*16+16 = 2^16 and wrap to zero.  The
C largest size that is safe in one pass is therefore 0x1ffe limbs, where no
C counter exceeds 4095*16 = 65520.  We use a chunksize close to that, but which
C allows the huge count code to jump deep into the code (at L(chu)).

define(`maxsize',  0x1ffe)
define(`chunksize',0x1ff0)

C mpn_hamdist (ap, bp, n)
C
C Returns in x0 the number of bit positions in which the n-limb operands at
C ap and bp differ, i.e. the population count of {ap,n} xor {bp,n}.
C
C Register roles:
C   x0, x1, x2	ap, bp, n on entry; x0 is the result, x1 is scratch at exit
C   v0-v3	limbs loaded from ap, then the xor of the two operands
C   v16-v19	limbs loaded from bp
C   v6, v7	per-byte bit counts of the xor
C   v4, v5	16-bit accumulators
C   x4-x11	used only for operands larger than maxsize (see L(gt8k))

ASM_START()
PROLOGUE(mpn_hamdist)

	mov	x11, #maxsize
	cmp	n, x11
	b.hi	L(gt8k)				C too big for the 16-bit counters

L(lt8k):
	movi	v4.16b, #0			C clear summation register
	movi	v5.16b, #0			C clear summation register

C Peel off n mod 8 limbs, one bit of n at a time.
	tbz	n, #0, L(xx0)
	sub	n, n, #1
	ld1	{v0.1d}, [ap], #8		C load 1 limb
	ld1	{v16.1d}, [bp], #8		C load 1 limb
	eor	v0.16b, v0.16b, v16.16b
	cnt	v6.16b, v0.16b
	uadalp	v4.8h,  v6.16b			C could also splat

L(xx0):	tbz	n, #1, L(x00)
	sub	n, n, #2
	ld1	{v0.2d}, [ap], #16		C load 2 limbs
	ld1	{v16.2d}, [bp], #16		C load 2 limbs
	eor	v0.16b, v0.16b, v16.16b
	cnt	v6.16b, v0.16b
	uadalp	v4.8h,  v6.16b

L(x00):	tbz	n, #2, L(000)
	subs	n, n, #4
	ld1	{v0.2d,v1.2d}, [ap], #32	C load 4 limbs
	ld1	{v16.2d,v17.2d}, [bp], #32	C load 4 limbs
	b.ls	L(sum)				C those were the last limbs

L(gt4):	ld1	{v2.2d,v3.2d}, [ap], #32	C load 4 limbs
	ld1	{v18.2d,v19.2d}, [bp], #32	C load 4 limbs
	eor	v0.16b, v0.16b, v16.16b
	eor	v1.16b, v1.16b, v17.16b
	sub	n, n, #4
	cnt	v6.16b, v0.16b
	cnt	v7.16b, v1.16b
	b	L(mid)				C enter loop at its second half

L(000):	subs	n, n, #8
	b.lo	L(e0)				C nothing left, just reduce v4

C L(chu) is also the entry point used by the huge count code, which arrives
C with n already reduced by 8 and with v4,v5 cleared.
L(chu):	ld1	{v2.2d,v3.2d}, [ap], #32	C load 4 limbs
	ld1	{v0.2d,v1.2d}, [ap], #32	C load 4 limbs
	ld1	{v18.2d,v19.2d}, [bp], #32	C load 4 limbs
	ld1	{v16.2d,v17.2d}, [bp], #32	C load 4 limbs
	eor	v2.16b, v2.16b, v18.16b
	eor	v3.16b, v3.16b, v19.16b
	cnt	v6.16b, v2.16b
	cnt	v7.16b, v3.16b
	subs	n, n, #8
	b.lo	L(end)

C Main loop, 8 limbs per iteration.  The counts in v6,v7 are accumulated while
C the next group of limbs is being loaded and xored.
L(top):	ld1	{v2.2d,v3.2d}, [ap], #32	C load 4 limbs
	ld1	{v18.2d,v19.2d}, [bp], #32	C load 4 limbs
	eor	v0.16b, v0.16b, v16.16b
	eor	v1.16b, v1.16b, v17.16b
	uadalp	v4.8h,  v6.16b
	cnt	v6.16b, v0.16b
	uadalp	v5.8h,  v7.16b
	cnt	v7.16b, v1.16b
L(mid):	ld1	{v0.2d,v1.2d}, [ap], #32	C load 4 limbs
	ld1	{v16.2d,v17.2d}, [bp], #32	C load 4 limbs
	eor	v2.16b, v2.16b, v18.16b
	eor	v3.16b, v3.16b, v19.16b
	subs	n, n, #8
	uadalp	v4.8h,  v6.16b
	cnt	v6.16b, v2.16b
	uadalp	v5.8h,  v7.16b
	cnt	v7.16b, v3.16b
	b.hs	L(top)

L(end):	uadalp	v4.8h,  v6.16b			C flush pending counts
	uadalp	v5.8h,  v7.16b
L(sum):	eor	v0.16b, v0.16b, v16.16b		C last 4 loaded limbs
	eor	v1.16b, v1.16b, v17.16b
	cnt	v6.16b, v0.16b
	cnt	v7.16b, v1.16b
	uadalp	v4.8h,  v6.16b
	uadalp	v5.8h,  v7.16b
	add	v4.8h, v4.8h, v5.8h		C must not exceed 2^16-1 per lane
					C we have 8 16-bit counts
L(e0):	uaddlp	v4.4s,  v4.8h			C we have 4 32-bit counts
	uaddlp	v4.2d,  v4.4s			C we have 2 64-bit counts
	mov	x0, v4.d[0]
	mov	x1, v4.d[1]
	add	x0, x0, x1
	ret

C Code for count > maxsize.  Splits operand and calls above code.
C The code above overwrites x0 and x1 with the partial count, so the pointers
C to the following block are computed before each call.  The link register is
C kept in x8 since the bl instructions below overwrite x30.
define(`ap2', x5)			C caller-saves reg not used above
define(`bp2', x6)			C caller-saves reg not used above
L(gt8k):
	mov	x8, x30			C save return address
	mov	x7, n			C full count (caller-saves reg not used above)
	mov	x4, #0			C total sum  (caller-saves reg not used above)
	mov	x9, #chunksize*8	C caller-saves reg not used above
	mov	x10, #chunksize		C caller-saves reg not used above

1:	add	ap2, ap, x9		C point at subsequent block
	add	bp2, bp, x9		C point at subsequent block
	mov	n, #chunksize-8		C count for this invocation, adjusted for entry pt
	movi	v4.16b, #0		C clear chunk summation register
	movi	v5.16b, #0		C clear chunk summation register
	bl	L(chu)			C jump deep inside code
	add	x4, x4, x0
	mov	ap, ap2			C put chunk pointer in place for calls
	mov	bp, bp2			C put chunk pointer in place for calls
	sub	x7, x7, x10
	cmp	x7, x11			C x11 still holds maxsize
	b.hi	1b

	mov	n, x7			C count for final invocation
	bl	L(lt8k)
	add	x0, x4, x0
	mov	x30, x8			C restore return address
	ret
EPILOGUE()