dnl  ARM Neon mpn_lshiftc.

dnl  Contributed to the GNU project by Torbjörn Granlund.

dnl  Copyright 2013 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C                   cycles/limb  cycles/limb  cycles/limb    good
C                     aligned     unaligned    best seen    for cpu?
C StrongARM              -            -
C XScale                 -            -
C Cortex-A7              ?            ?
C Cortex-A8              ?            ?
C Cortex-A9             3.5          3.5                       Y
C Cortex-A15            1.75         1.75                      Y


C We read 64 bits at a time at 32-bit aligned addresses, and except for the
C first and last store, we write using 64-bit aligned addresses.  All shifting
C is done on 64-bit words in 'extension' registers.
C
C It should be possible to read also using 64-bit alignment, by manipulating
C the shift count for unaligned operands.  Not done, since it does not seem to
C matter for A9 or A15.
C
C This will not work in big-endian mode.

C TODO
C  * Try using 128-bit operations.  Note that Neon lacks pure 128-bit shifts,
C    which might make it tricky.
C  * Clean up and simplify.
C  * Consider sharing most of the code for lshift and rshift, since the feed-in
C    code, the loop, and most of the wind-down code are identical.
C  * Replace the basecase code with code using 'extension' registers.
C  * Optimise.  It is not clear that this loop insn permutation is optimal for
C    either A9 or A15.

C INPUT PARAMETERS
define(`rp', `r0')
define(`ap', `r1')
define(`n', `r2')
define(`cnt', `r3')

ASM_START(neon)
	TEXT
	ALIGN(64)
PROLOGUE(mpn_lshiftc)
	mov	r12, n, lsl #2
	add	rp, rp, r12
	add	ap, ap, r12

	cmp	n, #4			C SIMD code n limit
	ble	L(base)

	vdup.32	d6, r3			C left shift count is positive
	sub	r3, r3, #64		C right shift count is negative
	vdup.32	d7, r3
	mov	r12, #-8		C lshift pointer update offset

	sub	ap, ap, #8
	vld1.32	{d19}, [ap], r12	C load initial 2 limbs
	vshl.u64 d18, d19, d7		C retval

	tst	rp, #4			C is rp 64-bit aligned already?
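C If rp is only 32-bit aligned, the code below peels off one limb so that
C rp becomes 64-bit aligned; all remaining stores except the final one can
C then use 64-bit aligned addresses.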
	beq	L(rp_aligned)		C yes, skip
	vmvn	d19, d19
	add	ap, ap, #4		C move back ap pointer
	vshl.u64 d4, d19, d6
	sub	n, n, #1		C first limb handled
	sub	rp, rp, #4
	vst1.32	{d4[1]}, [rp]		C store first limb, rp gets aligned
	vld1.32	{d19}, [ap], r12	C load ap[1] and ap[2]

L(rp_aligned):
	sub	rp, rp, #8
	subs	n, n, #6
	vmvn	d19, d19
	blt	L(two_or_three_more)
	tst	n, #2
	beq	L(2)

L(1):	vld1.32	{d17}, [ap], r12
	vshl.u64 d5, d19, d6
	vmvn	d17, d17
	vld1.32	{d16}, [ap], r12
	vshl.u64 d0, d17, d7
	vshl.u64 d4, d17, d6
	sub	n, n, #2
	b	L(mid)

L(2):	vld1.32	{d16}, [ap], r12
	vshl.u64 d4, d19, d6
	vmvn	d16, d16
	vld1.32	{d17}, [ap], r12
	vshl.u64 d1, d16, d7
	vshl.u64 d5, d16, d6
	subs	n, n, #4
	blt	L(end)

L(top):	vmvn	d17, d17
	vld1.32	{d16}, [ap], r12
	vorr	d2, d4, d1
	vshl.u64 d0, d17, d7
	vshl.u64 d4, d17, d6
	vst1.32	{d2}, [rp:64], r12
L(mid):	vmvn	d16, d16
	vld1.32	{d17}, [ap], r12
	vorr	d3, d5, d0
	vshl.u64 d1, d16, d7
	vshl.u64 d5, d16, d6
	vst1.32	{d3}, [rp:64], r12
	subs	n, n, #4
	bge	L(top)

L(end):	tst	n, #1
	beq	L(evn)

	vorr	d2, d4, d1
	vst1.32	{d2}, [rp:64], r12
	b	L(cj1)

L(evn):	vmvn	d17, d17
	vorr	d2, d4, d1
	vshl.u64 d0, d17, d7
	vshl.u64 d4, d17, d6
	vst1.32	{d2}, [rp:64], r12
	vmov.u8	d17, #255
	vorr	d2, d5, d0
	vshl.u64 d0, d17, d7
	vorr	d3, d4, d0
	b	L(cj2)

C Load last 2 - 3 limbs, store last 4 - 5 limbs
L(two_or_three_more):
	tst	n, #1
	beq	L(l2)

L(l3):	vshl.u64 d5, d19, d6
	vld1.32	{d17}, [ap], r12
L(cj1):	vmov.u8	d16, #0
	add	ap, ap, #4
	vmvn	d17, d17
	vld1.32	{d16[1]}, [ap], r12
	vshl.u64 d0, d17, d7
	vshl.u64 d4, d17, d6
	vmvn	d16, d16
	vorr	d3, d5, d0
	vshl.u64 d1, d16, d7
	vshl.u64 d5, d16, d6
	vst1.32	{d3}, [rp:64], r12
	vorr	d2, d4, d1
	vst1.32	{d2}, [rp:64], r12
	add	rp, rp, #4
	vst1.32	{d5[1]}, [rp]
	vmov.32	r0, d18[0]
	bx	lr

L(l2):	vld1.32	{d16}, [ap], r12
	vshl.u64 d4, d19, d6
	vmvn	d16, d16
	vshl.u64 d1, d16, d7
	vshl.u64 d5, d16, d6
	vmov.u8	d17, #255
	vorr	d2, d4, d1
	vshl.u64 d0, d17, d7
	vorr	d3, d5, d0
L(cj2):	vst1.32	{d2}, [rp:64], r12
	vst1.32	{d3}, [rp]
	vmov.32	r0, d18[0]
	bx	lr


define(`tnc', `r12')
L(base):
	push	{r4, r6, r7, r8}
	ldr	r4, [ap, #-4]!
	rsb	tnc, cnt, #32
	mvn	r6, r4

	mov	r7, r6, lsl cnt
	tst	n, #1
	beq	L(ev)			C n even

L(od):	subs	n, n, #2
	bcc	L(ed1)			C n = 1
	ldr	r8, [ap, #-4]!
	mvn	r8, r8
	b	L(md)			C n = 3

L(ev):	ldr	r6, [ap, #-4]!
	mvn	r6, r6
	subs	n, n, #2
	beq	L(ed)			C n = 2
					C n = 4
L(tp):	ldr	r8, [ap, #-4]!
	orr	r7, r7, r6, lsr tnc
	str	r7, [rp, #-4]!
	mvn	r8, r8
	mov	r7, r6, lsl cnt
L(md):	ldr	r6, [ap, #-4]!
	orr	r7, r7, r8, lsr tnc
	str	r7, [rp, #-4]!
	mvn	r6, r6
	mov	r7, r8, lsl cnt

L(ed):	orr	r7, r7, r6, lsr tnc
	str	r7, [rp, #-4]!
	mov	r7, r6, lsl cnt
L(ed1):	mvn	r6, #0
	orr	r7, r7, r6, lsr tnc
	str	r7, [rp, #-4]
	mov	r0, r4, lsr tnc
	pop	{r4, r6, r7, r8}
	bx	r14
EPILOGUE()