dnl  ARM64 Neon mpn_sec_tabselect.

dnl  Contributed to the GNU project by Torbjörn Granlund.

dnl  Copyright 2011-2014 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C	     cycles/limb
C Cortex-A53	      ?
C Cortex-A57	      ?

C void
C mpn_sec_tabselect (mp_ptr rp, mp_srcptr *tab,
C		     mp_size_t n, mp_size_t nents, mp_size_t which)
C
C Store at rp the n-limb table entry whose index equals `which'.  Entries
C are laid out back to back, n limbs (n*8 bytes) apart, starting at tab.
C
C Every one of the nents entries is loaded for every output slice; the
C wanted one is merged into zero-initialised accumulators with a compare
C mask (cmeq + bit).  No branch and no load address depends on `which';
C the branches only test n, nents and the loop counters.
C
C Output is produced in slices: 4 limbs at a time while at least 4 limbs
C remain, then a 2-limb tail if bit 1 of n is set, then a 1-limb tail if
C bit 0 of n is set.
C
C The inner loops test their counter only after the first pass, so the
C code relies on nents >= 1 whenever n > 0.  If no index in 0..nents-1
C equals `which', zeros are stored.
C
C Register use (all caller-saved):
C   x0  rp	output pointer, post-incremented by each store
C   x1  tp	table cursor
C   x2  n	limbs per entry
C   x3  nents	number of entries
C   x4  which	index to select
C   x5  i	inner (entry) loop counter
C   x6  j	outer (4-limb slice) loop counter
C   x10		constant 1
C   x12		tp at the start of the current slice
C   v0,v1	data loaded from the current entry
C   v2,v3	accumulators for the selected data
C   v4  maskq	all-ones in lanes where the running index equals `which'
C   v5		running entry index, one copy per 64-bit lane
C   v6		1 in each 64-bit lane
C   v7		`which' in each 64-bit lane

changecom(@&*$)

define(`rp',     `x0')
define(`tp',     `x1')
define(`n',      `x2')
define(`nents',  `x3')
define(`which',  `x4')

define(`i',      `x5')
define(`j',      `x6')

define(`maskq',  `v4')

ASM_START()
PROLOGUE(mpn_sec_tabselect)
	dup	v7.2d, x4			C 2 `which' copies

	mov	x10, #1
	dup	v6.2d, x10			C 2 copies of 1

	subs	j, n, #4
	b.mi	L(outer_end)			C fewer than 4 limbs: tails only

C Main part: one pass over the whole table per 4-limb output slice.
L(outer_top):
	mov	i, nents
	mov	x12, tp				C preserve tp
	movi	v5.16b, #0			C zero 2 counter copies
	movi	v2.16b, #0			C clear both accumulators
	movi	v3.16b, #0
	ALIGN(16)
L(tp4):	cmeq	maskq.2d, v5.2d, v7.2d		C compare idx copies to `which' copies
	ld1	{v0.2d,v1.2d}, [tp]
	add	v5.2d, v5.2d, v6.2d		C next entry index
	bit	v2.16b, v0.16b, maskq.16b	C keep data only where mask is set
	bit	v3.16b, v1.16b, maskq.16b
	add	tp, tp, n, lsl #3		C step to same slice of next entry
	sub	i, i, #1
	cbnz	i, L(tp4)
	st1	{v2.2d,v3.2d}, [rp], #32
	add	tp, x12, #32			C restore tp, point to next slice
	subs	j, j, #4
	b.pl	L(outer_top)
L(outer_end):

C 2-limb tail, taken when bit 1 of n is set.
	tbz	n, #1, L(b0x)
	mov	i, nents
	mov	x12, tp				C preserve tp
	movi	v5.16b, #0			C zero 2 counter copies
	movi	v2.16b, #0			C clear accumulator
	ALIGN(16)
L(tp2):	cmeq	maskq.2d, v5.2d, v7.2d
	ld1	{v0.2d}, [tp]
	add	v5.2d, v5.2d, v6.2d
	bit	v2.16b, v0.16b, maskq.16b
	add	tp, tp, n, lsl #3
	sub	i, i, #1
	cbnz	i, L(tp2)
	st1	{v2.2d}, [rp], #16
	add	tp, x12, #16			C restore tp, point to last limb

C 1-limb tail, taken when bit 0 of n is set.  tp is not needed after this
C loop, so it is neither saved nor restored here.
L(b0x):	tbz	n, #0, L(b00)
	mov	i, nents
	movi	v5.16b, #0			C zero 2 counter copies
	movi	v2.16b, #0			C clear accumulator
	ALIGN(16)
L(tp1):	cmeq	maskq.2d, v5.2d, v7.2d
	ld1	{v0.1d}, [tp]
	add	v5.2d, v5.2d, v6.2d		C FIXME size should be `1d'
	bit	v2.8b, v0.8b, maskq.8b		C only the low 8 mask bytes are used
	add	tp, tp, n, lsl #3
	sub	i, i, #1
	cbnz	i, L(tp1)
	st1	{v2.1d}, [rp], #8

L(b00):	ret
EPILOGUE()