dnl  ARM Neon mpn_sec_tabselect.

dnl  Contributed to the GNU project by Torbjörn Granlund.

dnl  Copyright 2011-2013 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C	     cycles/limb
C StrongARM	 -
C XScale	 -
C Cortex-A7	 ?
C Cortex-A8	 ?
C Cortex-A9	 1.15
C Cortex-A15	 0.65

C mpn_sec_tabselect (rp, tp, n, nents, which)
C
C Copy table entry number `which' (0-based) to rp[0..n-1].  The table at tp
C holds nents entries laid out back to back, each entry being n 32-bit limbs
C (the code steps from one entry to the next with tp += n*4 bytes).
C
C Every entry is loaded on every call; the wanted one is picked with a lane
C mask (vceq + vbit) instead of a branch or a computed address, so neither the
C control flow nor the addresses touched depend on the value of `which'.
C
C In:	r0 = rp, r1 = tp, r2 = n, r3 = nents, [sp] = which (fifth argument)
C Out:	rp[0..n-1] written; no value is returned in a register
C Uses:	r4, r5 (saved and restored here), r12, flags,
C	q0-q3, q10 (mask), q13 (entry counter), q14 (ones), q15 (which)
C
C NOTE(review): every inner loop decrements its counter before testing it, so
C nents is assumed to be at least 1 -- confirm against callers.

define(`rp',     `r0')
define(`tp',     `r1')
define(`n',      `r2')
define(`nents',  `r3')
C define(`which',  on stack)

define(`i',      `r4')
define(`j',      `r5')

define(`maskq',  `q10')
define(`maskd',  `d20')

ASM_START()
PROLOGUE(mpn_sec_tabselect)
	push	{r4-r5}

	add	r4, sp, #8		C skip the two words just pushed
	vld1.32	{d30[], d31[]}, [r4]	C 4 `which' copies
	vmov.i32 q14, #1		C 4 copies of 1

	subs	j, n, #8		C j = n - 8, fewer than 8 limbs in total?
	bmi	L(outer_end)

C Main loop: each outer pass selects one 8-limb (32-byte) slice of the
C result by walking that slice through all nents entries.
L(outer_top):
	mov	i, nents
	mov	r12, tp			C preserve tp
	veor	q13, q13, q13		C 4 counter copies
	veor	q2, q2, q2		C clear result accumulators
	veor	q3, q3, q3
	ALIGN(16)
L(top):	vceq.i32 maskq, q13, q15	C compare idx copies to `which' copies
	vld1.32	{q0,q1}, [tp]
	vadd.i32 q13, q13, q14		C bump entry counter
	vbit	q2, q0, maskq		C take the loaded limbs iff mask is set
	vbit	q3, q1, maskq
	add	tp, tp, n, lsl #2	C same slice of the next entry
	subs	i, i, #1
	bne	L(top)
	vst1.32	{q2,q3}, [rp]!
	add	tp, r12, #32		C restore tp, point to next slice
	subs	j, j, #8
	bpl	L(outer_top)
L(outer_end):

C Tail: the low three bits of n choose a 4-limb, a 2-limb and a 1-limb pass,
C each built like the main loop but with narrower loads and stores.
	tst	n, #4
	beq	L(b0xx)
L(b1xx):mov	i, nents
	mov	r12, tp
	veor	q13, q13, q13
	veor	q2, q2, q2
	ALIGN(16)
L(tp4):	vceq.i32 maskq, q13, q15
	vld1.32	{q0}, [tp]
	vadd.i32 q13, q13, q14
	vbit	q2, q0, maskq
	add	tp, tp, n, lsl #2
	subs	i, i, #1
	bne	L(tp4)
	vst1.32	{q2}, [rp]!
	add	tp, r12, #16		C restore tp, step past these 4 limbs

C The 64-bit passes use d26, d28, d30 and maskd (d20), which are the low
C halves of q13, q14, q15 and maskq (q10).
L(b0xx):tst	n, #2
	beq	L(b00x)
L(b01x):mov	i, nents
	mov	r12, tp
	veor	d26, d26, d26
	veor	d4, d4, d4
	ALIGN(16)
L(tp2):	vceq.i32 maskd, d26, d30
	vld1.32	{d0}, [tp]
	vadd.i32 d26, d26, d28
	vbit	d4, d0, maskd
	add	tp, tp, n, lsl #2
	subs	i, i, #1
	bne	L(tp2)
	vst1.32	{d4}, [rp]!
	add	tp, r12, #8		C restore tp, step past these 2 limbs

L(b00x):tst	n, #1
	beq	L(b000)
L(b001):mov	i, nents
	mov	r12, tp
	veor	d26, d26, d26
	veor	d4, d4, d4
	ALIGN(16)
L(tp1):	vceq.i32 maskd, d26, d30
	vld1.32	{d0[0]}, [tp]		C load a single limb into lane 0
	vadd.i32 d26, d26, d28
	vbit	d4, d0, maskd
	add	tp, tp, n, lsl #2
	subs	i, i, #1
	bne	L(tp1)
	vst1.32	{d4[0]}, [rp]		C store lane 0 only, last limb

L(b000):pop	{r4-r5}
	bx	r14
EPILOGUE()