dnl  ARM64 Neon mpn_sec_tabselect.

dnl  Contributed to the GNU project by Torbjörn Granlund.

dnl  Copyright 2011-2014 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.
include(`../config.m4')


C	     cycles/limb
C Cortex-A53	 2.25
C Cortex-A57	 1.33
C X-Gene	 2

C void
C mpn_sec_tabselect (mp_ptr rp, mp_srcptr *tab,
C		     mp_size_t n, mp_size_t nents, mp_size_t which)
C
C Purpose: write to rp the n limbs of table entry number `which', while
C loading every one of the nents entries.  A running index counter is
C compared with `which' to form an all-ones/all-zeros mask, and the limbs just
C loaded are merged into an accumulator under that mask.  Load addresses and
C branch decisions are computed from tab, n and nents only; `which' feeds
C nothing but the vector compare.
C
C Table layout as used by this code: entry k starts k*n*8 bytes after tab
C (tp is advanced by n << 3 per entry), with limbs handled as 64-bit lanes.
C NOTE(review): the prototype above writes `mp_srcptr *tab', but the code
C loads limb data directly from tab and never dereferences a pointer stored
C in the table -- confirm against the C declaration.
C
C The n limbs are processed in column slices: as many 4-limb slices as fit,
C then one 2-limb slice if bit 1 of n is set, then one 1-limb slice if bit 0
C of n is set.  Each slice makes a full pass over all nents entries.
C
C Edge cases visible in the code:
C   * n = 0 stores nothing (all three slice sections are skipped).
C   * The entry loops decrement the count before testing it, so they always
C     run at least once; a zero nents would wrap the counter.  Callers are
C     presumably required to pass nents >= 1 -- verify against the callers.
C   * If `which' matches none of the indices 0 .. nents-1 the mask is never
C     set and the zero-initialised accumulators are stored.
C
C Registers written: x0 (rp), x1 (tp), x5, x6, x10, x12, v0-v7 and the
C condition flags.  sp is never referenced, so no stack is used.

C Replace the m4 comment delimiter with the string `blah'.  Presumably this
C keeps `#' in immediates such as #1 from starting an m4 comment that would
C hide the rest of the line from macro expansion -- TODO confirm.
changecom(blah)

C Argument registers, in prototype order.
define(`rp',     `x0')
define(`tp',     `x1')
define(`n',      `x2')
define(`nents',  `x3')
define(`which',  `x4')

C Loop counters: i counts table entries left in the current pass,
C j tracks the limbs left for the 4-limb slices (biased by -4).
define(`i',      `x5')
define(`j',      `x6')

C Vector register receiving the per-entry select mask.
define(`maskq',  `v4')

C ASM_START, PROLOGUE, EPILOGUE, ALIGN and L are not defined in this file;
C presumably they come from the included config.m4.
ASM_START()
PROLOGUE(mpn_sec_tabselect)
	dup	v7.2d, x4			C 2 `which' copies (x4 is the `which' argument)

	mov	x10, #1
	dup	v6.2d, x10			C 2 copies of 1, the per-entry counter increment

	subs	j, n, #4			C j = n - 4
	b.mi	L(outer_end)			C negative: no complete 4-limb slice

C ---- 4-limb slices: one pass over all entries per slice ----
L(outer_top):
	mov	i, nents
	mov	x12, tp				C preserve tp
	movi	v5.16b, #0			C zero 2 counter copies
	movi	v2.16b, #0			C zero the accumulator for limbs 0-1
	movi	v3.16b, #0			C zero the accumulator for limbs 2-3
	ALIGN(16)
L(tp4):	cmeq	maskq.2d, v5.2d, v7.2d		C compare idx copies to `which' copies
	ld1	{v0.2d,v1.2d}, [tp]		C load 4 limbs of the current entry
	add	v5.2d, v5.2d, v6.2d		C advance the entry index
	bit	v2.16b, v0.16b, maskq.16b	C take the loaded limbs where the mask is set
	bit	v3.16b, v1.16b, maskq.16b
	add	tp, tp, n, lsl #3		C step to the same slice of the next entry
	sub	i, i, #1
	cbnz	i, L(tp4)
	st1	{v2.2d,v3.2d}, [rp], #32	C store 4 selected limbs, rp += 32
	add	tp, x12, #32			C restore tp, point to next slice
	subs	j, j, #4
	b.pl	L(outer_top)			C repeat while at least 4 limbs remain
L(outer_end):

C ---- optional 2-limb slice, taken when bit 1 of n is set ----
	tbz	n, #1, L(b0x)
	mov	i, nents
	mov	x12, tp				C preserve tp
	movi	v5.16b, #0			C zero 2 counter copies
	movi	v2.16b, #0			C zero the accumulator
	ALIGN(16)
L(tp2):	cmeq	maskq.2d, v5.2d, v7.2d		C mask = (index == `which')
	ld1	{v0.2d}, [tp]			C load 2 limbs of the current entry
	add	v5.2d, v5.2d, v6.2d		C advance the entry index
	bit	v2.16b, v0.16b, maskq.16b	C take the loaded limbs where the mask is set
	add	tp, tp, n, lsl #3		C step to the next entry
	sub	i, i, #1
	cbnz	i, L(tp2)
	st1	{v2.2d}, [rp], #16		C store 2 selected limbs, rp += 16
	add	tp, x12, #16			C restore tp, point past this slice

C ---- optional 1-limb slice, taken when bit 0 of n is set ----
L(b0x):	tbz	n, #0, L(b00)
	mov	i, nents
	mov	x12, tp				C preserve tp
	movi	v5.16b, #0			C zero 2 counter copies
	movi	v2.16b, #0			C zero the accumulator
	ALIGN(16)
L(tp1):	cmeq	maskq.2d, v5.2d, v7.2d		C mask = (index == `which')
	ld1	{v0.1d}, [tp]			C load 1 limb of the current entry
	add	v5.2d, v5.2d, v6.2d		C FIXME size should be `1d'
	bit	v2.8b, v0.8b, maskq.8b		C only the low 8 bytes are merged here
	add	tp, tp, n, lsl #3		C step to the next entry
	sub	i, i, #1
	cbnz	i, L(tp1)
	st1	{v2.1d}, [rp], #8		C store the selected limb, rp += 8
	add	tp, x12, #8			C tp is not read again before the ret

L(b00):	ret
EPILOGUE()