1dnl ARM64 mpn_rshift. 2 3dnl Copyright 2013, 2014, 2017 Free Software Foundation, Inc. 4 5dnl This file is part of the GNU MP Library. 6dnl 7dnl The GNU MP Library is free software; you can redistribute it and/or modify 8dnl it under the terms of either: 9dnl 10dnl * the GNU Lesser General Public License as published by the Free 11dnl Software Foundation; either version 3 of the License, or (at your 12dnl option) any later version. 13dnl 14dnl or 15dnl 16dnl * the GNU General Public License as published by the Free Software 17dnl Foundation; either version 2 of the License, or (at your option) any 18dnl later version. 19dnl 20dnl or both in parallel, as here. 21dnl 22dnl The GNU MP Library is distributed in the hope that it will be useful, but 23dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 24dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 25dnl for more details. 26dnl 27dnl You should have received copies of the GNU General Public License and the 28dnl GNU Lesser General Public License along with the GNU MP Library. If not, 29dnl see https://www.gnu.org/licenses/. 30 31include(`../config.m4') 32 33C cycles/limb assumed optimal c/l 34C Cortex-A53 3.5-4.0 3.25 35C Cortex-A57 2.0 2.0 36C X-Gene 2.67 2.5 37 38C TODO 39C * The feed-in code used 1 ldr for odd sized and 2 ldr for even sizes. These 40C numbers should be 1 and 0, respectively. The str in wind-down should also 41C go. 42C * Using extr and with 63 separate loops we might reach 1.25 c/l on A57. 43C * A53's speed depends on alignment, but not as simply as for lshift/lshiftc. 

C mpn_rshift -- shift an n-limb operand right by cnt bits.
C
C In:	x0 = rp_arg	destination limb pointer
C	x1 = up		source limb pointer
C	x2 = n		number of 64-bit limbs
C	x3 = cnt	shift count in bits
C Out:	x0 = up[0] << tnc, i.e. the bits shifted out at the low end, placed in
C	the most significant bits of the returned limb.
C
C For 0 <= i < n-1 the code stores  rp[i] = (up[i] >> cnt) | (up[i+1] << tnc)
C and finally                       rp[n-1] = up[n-1] >> cnt.
C
C tnc is computed as 0 - cnt.  Register-specified lsl/lsr use the count modulo
C 64, so this acts as 64 - cnt.
C
C NOTE(review): nothing here checks the arguments.  The code appears to rely on
C n >= 1 and 1 <= cnt <= 63 (with cnt = 0 the two shifted halves would overlap
C when or-ed together) -- confirm against the documented mpn_rshift contract.
C
C Register use:
C	x16	rp, working destination pointer (x0 is reused for the return value)
C	x8	tnc
C	x17	number of 4-limb groups still to be processed (n >> 2)
C	x4-x7	source limbs loaded from up
C	x2	previous limb >> cnt, waiting to be merged with the next limb
C		(this overwrites n, which is dead once its two low bits are tested)
C	x10-x13	temporaries for partial results
C
C The group counter is kept in x17 rather than x18: x18 is the AAPCS64 platform
C register and is reserved on some systems (e.g. Apple platforms and Windows),
C so it must not be written here.  x16 and x17 are the IP0/IP1 scratch
C registers; this routine makes no calls and only branches to its own local
C labels, so they are free to use.

changecom(blah)

define(`rp_arg', `x0')
define(`up',     `x1')
define(`n',      `x2')
define(`cnt',    `x3')

define(`rp',     `x16')

define(`tnc',`x8')

define(`PSHIFT', lsr)
define(`NSHIFT', lsl)

ASM_START()
PROLOGUE(mpn_rshift)
	mov	rp, rp_arg		C free x0 for the return value
	sub	tnc, xzr, cnt		C tnc = -cnt, used modulo 64 by lsl
	lsr	x17, n, #2		C x17 = n / 4
	tbz	n, #0, L(bx0)		C dispatch on n mod 4: even sizes

L(bx1):	ldr	x5, [up]		C odd size: x5 = up[0]
	tbnz	n, #1, L(b11)

C n = 1 (mod 4)
L(b01):	NSHIFT	x0, x5, tnc		C return value
	PSHIFT	x2, x5, cnt
	cbnz	x17, L(gt1)
	str	x2, [rp]		C n = 1: single limb result
	ret
L(gt1):	ldp	x4, x5, [up,#8]		C x4,x5 = up[1],up[2]
	sub	up, up, #8		C bias up and rp for the pre-indexed
	sub	rp, rp, #32		C accesses (#32) at lo2
	b	L(lo2)

C n = 3 (mod 4)
L(b11):	NSHIFT	x0, x5, tnc		C return value
	PSHIFT	x2, x5, cnt
	ldp	x6, x7, [up,#8]!	C x6,x7 = up[1],up[2]
	sub	rp, rp, #16		C bias rp for the stp at [rp,#16]
	b	L(lo3)

L(bx0):	ldp	x4, x5, [up]		C even size: x4,x5 = up[0],up[1]
	tbz	n, #1, L(b00)

C n = 2 (mod 4)
L(b10):	NSHIFT	x0, x4, tnc		C return value
	PSHIFT	x13, x4, cnt
	NSHIFT	x10, x5, tnc
	PSHIFT	x2, x5, cnt
	cbnz	x17, L(gt2)
	orr	x10, x10, x13
	stp	x10, x2, [rp]		C n = 2: store both result limbs
	ret
L(gt2):	ldp	x4, x5, [up,#16]	C x4,x5 = up[2],up[3]
	orr	x10, x10, x13
	str	x10, [rp],#-24		C store rp[0], then bias rp for lo2
	b	L(lo2)

C n = 0 (mod 4)
L(b00):	NSHIFT	x0, x4, tnc		C return value
	PSHIFT	x13, x4, cnt
	NSHIFT	x10, x5, tnc
	PSHIFT	x2, x5, cnt
	ldp	x6, x7, [up,#16]!	C x6,x7 = up[2],up[3]
	orr	x10, x10, x13
	str	x10, [rp],#-8		C store rp[0], then bias rp for the loop
	b	L(lo0)

C Main loop: one full pass from top to the cbnz loads four source limbs and
C stores four result limbs (two in the first half, two in the second half).
	ALIGN(16)
L(top):	ldp	x4, x5, [up,#16]
	orr	x10, x10, x13
	orr	x11, x12, x2
	stp	x11, x10, [rp,#16]
	PSHIFT	x2, x7, cnt
L(lo2):	NSHIFT	x10, x5, tnc
	NSHIFT	x12, x4, tnc
	PSHIFT	x13, x4, cnt
	ldp	x6, x7, [up,#32]!	C advance up by four limbs
	orr	x10, x10, x13
	orr	x11, x12, x2
	stp	x11, x10, [rp,#32]!	C advance rp by four limbs
	PSHIFT	x2, x5, cnt
L(lo0):	sub	x17, x17, #1		C one 4-limb group done
L(lo3):	NSHIFT	x10, x7, tnc
	NSHIFT	x12, x6, tnc
	PSHIFT	x13, x6, cnt
	cbnz	x17, L(top)

C Wind-down: merge and store the last two combined limbs, then the top limb,
C which receives only the bits of the last source limb shifted right.
L(end):	orr	x10, x10, x13
	orr	x11, x12, x2
	PSHIFT	x2, x7, cnt
	stp	x11, x10, [rp,#16]
	str	x2, [rp,#32]
	ret
EPILOGUE()