1dnl ARM64 mpn_lshift. 2 3dnl Copyright 2013, 2014, 2017 Free Software Foundation, Inc. 4 5dnl This file is part of the GNU MP Library. 6dnl 7dnl The GNU MP Library is free software; you can redistribute it and/or modify 8dnl it under the terms of either: 9dnl 10dnl * the GNU Lesser General Public License as published by the Free 11dnl Software Foundation; either version 3 of the License, or (at your 12dnl option) any later version. 13dnl 14dnl or 15dnl 16dnl * the GNU General Public License as published by the Free Software 17dnl Foundation; either version 2 of the License, or (at your option) any 18dnl later version. 19dnl 20dnl or both in parallel, as here. 21dnl 22dnl The GNU MP Library is distributed in the hope that it will be useful, but 23dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 24dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 25dnl for more details. 26dnl 27dnl You should have received copies of the GNU General Public License and the 28dnl GNU Lesser General Public License along with the GNU MP Library. If not, 29dnl see https://www.gnu.org/licenses/. 30 31include(`../config.m4') 32 33C cycles/limb assumed optimal c/l 34C Cortex-A53 3.5-4.0 3.25 35C Cortex-A57 2.0 2.0 36C X-Gene 2.67 2.5 37 38C TODO 39C * The feed-in code used 1 ldr for odd sized and 2 ldr for even sizes. These 40C numbers should be 1 and 0, respectively. The str in wind-down should also 41C go. 42C * Using extr and with 63 separate loops we might reach 1.25 c/l on A57. 43C * A53's speed depends on alignment, tune/speed -w1 gives 3.5, -w0 gives 4.0. 
changecom(blah)

C INPUT PARAMETERS
define(`rp_arg', `x0')
define(`up',     `x1')
define(`n',      `x2')
define(`cnt',    `x3')

C Working copy of the destination pointer, so that x0 can receive the
C return value during feed-in.
define(`rp',     `x16')

C Complementary shift count, set to -cnt on entry.
define(`tnc',`x8')

C PSHIFT moves bits towards the most significant end, NSHIFT the other way.
define(`PSHIFT', lsl)
define(`NSHIFT', lsr)

C mpn_lshift
C
C Shift the n-limb operand at up left by cnt bits, store the n result limbs
C at rp_arg, and return in x0 the bits shifted out of the most significant
C limb.  Limbs are processed from the most significant one downwards.
C
C NOTE(review): nothing here checks the arguments; the code appears to rely
C on n >= 1 and 1 <= cnt <= 63.  Confirm against the callers.
C
C Register usage:
C   x0        rp_arg on entry, return value afterwards
C   x1        source pointer, initially one past the last source limb
C   x2        n on entry; once the low two bits of n have been tested it
C             holds the left-shifted part of a limb still waiting to be
C             combined with its lower neighbour
C   x3        cnt
C   x4-x7     limbs loaded from the source
C   x8        tnc = -cnt; register shifts use the count modulo 64, so
C             shifting by tnc behaves as shifting by 64-cnt
C   x10-x13   partial results being combined
C   x16       destination pointer, initially one past the last result limb
C   x17       loop counter, n/4
C
C x18 is deliberately not used: it is the AAPCS64 platform register and is
C reserved on some operating systems.  x17 is an ordinary scratch register,
C and this routine makes no calls during which it could be overwritten.

ASM_START()
PROLOGUE(mpn_lshift)
	add	rp, rp_arg, n, lsl #3
	add	up, up, n, lsl #3
	sub	tnc, xzr, cnt
	lsr	x17, n, #2
	tbz	n, #0, L(bx0)

C n is odd: fetch the most significant limb alone.
L(bx1):	ldr	x4, [up,#-8]
	tbnz	n, #1, L(b11)

C n mod 4 = 1
L(b01):	NSHIFT	x0, x4, tnc
	PSHIFT	x2, x4, cnt
	cbnz	x17, L(gt1)
C n = 1: the single result limb is all there is to store.
	str	x2, [rp,#-8]
	ret
C n >= 5: fetch two more limbs and bias the pointers for entry at lo2.
L(gt1):	ldp	x4, x5, [up,#-24]
	sub	up, up, #8
	add	rp, rp, #16
	b	L(lo2)

C n mod 4 = 3
L(b11):	NSHIFT	x0, x4, tnc
	PSHIFT	x2, x4, cnt
	ldp	x6, x7, [up,#-24]!
	b	L(lo3)

C n is even: fetch the two most significant limbs.
L(bx0):	ldp	x4, x5, [up,#-16]
	tbz	n, #1, L(b00)

C n mod 4 = 2
L(b10):	NSHIFT	x0, x5, tnc
	PSHIFT	x13, x5, cnt
	NSHIFT	x10, x4, tnc
	PSHIFT	x2, x4, cnt
	cbnz	x17, L(gt2)
C n = 2: store both result limbs and return.
	orr	x10, x10, x13
	stp	x2, x10, [rp,#-16]
	ret
C n >= 6: store the top result limb, then join the loop at lo2.
L(gt2):	ldp	x4, x5, [up,#-32]
	orr	x10, x10, x13
	str	x10, [rp,#-8]
	sub	up, up, #16
	add	rp, rp, #8
	b	L(lo2)

C n mod 4 = 0
L(b00):	NSHIFT	x0, x5, tnc
	PSHIFT	x13, x5, cnt
	NSHIFT	x10, x4, tnc
	PSHIFT	x2, x4, cnt
	ldp	x6, x7, [up,#-32]!
	orr	x10, x10, x13
	str	x10, [rp,#-8]!
	b	L(lo0)

C Main loop: each iteration loads four source limbs and stores four result
C limbs.  The feed-in code above enters at lo2, lo0 or lo3.
	ALIGN(16)
L(top):	ldp	x4, x5, [up,#-16]
	orr	x10, x10, x13
	orr	x11, x12, x2
	stp	x10, x11, [rp,#-16]
	PSHIFT	x2, x6, cnt
L(lo2):	NSHIFT	x10, x4, tnc
	PSHIFT	x13, x5, cnt
	NSHIFT	x12, x5, tnc
	ldp	x6, x7, [up,#-32]!
	orr	x10, x10, x13
	orr	x11, x12, x2
	stp	x10, x11, [rp,#-32]!
	PSHIFT	x2, x4, cnt
L(lo0):	sub	x17, x17, #1
L(lo3):	NSHIFT	x10, x6, tnc
	PSHIFT	x13, x7, cnt
	NSHIFT	x12, x7, tnc
	cbnz	x17, L(top)

C Wind-down: store the last combined pair, then the least significant
C result limb, which has no lower neighbour and is just the shifted limb.
L(end):	orr	x10, x10, x13
	orr	x11, x12, x2
	PSHIFT	x2, x6, cnt
	stp	x10, x11, [rp,#-16]
	str	x2, [rp,#-24]
	ret
EPILOGUE()