dnl  ARM64 mpn_lshiftc.

dnl  Copyright 2013, 2014, 2017 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb   assumed optimal c/l
C Cortex-A53	3.5-4.0		 3.25
C Cortex-A57	 2.0		 2.0
C X-Gene	 2.67		 2.5

C TODO
C  * The feed-in code used 1 ldr for odd sizes and 2 ldr for even sizes.  These
C    numbers should be 1 and 0, respectively.  The str in wind-down should also
C    go.
C  * Using extr and with 63 separate loops we might reach 1.5 c/l on A57.
C  * A53's speed depends on alignment, tune/speed -w1 gives 3.5, -w0 gives 4.0.
changecom(blah)

define(`rp_arg', `x0')
define(`up',     `x1')
define(`n',      `x2')
define(`cnt',    `x3')

define(`rp',     `x16')

define(`tnc',`x8')

define(`PSHIFT', lsl)
define(`NSHIFT', lsr)

C ---------------------------------------------------------------------------
C mpn_lshiftc -- shift a limb vector left by cnt bits and store the
C one-complement of every result limb.
C
C ABI:  AAPCS64, leaf routine, no stack use.
C In:   x0 = rp_arg  destination limb pointer
C       x1 = up      source limb pointer
C       x2 = n       number of 64-bit limbs
C       x3 = cnt     shift count in bits
C Out:  x0 = most significant source limb shifted right by 64-cnt, that is
C            the bits pushed out at the top.  This value is NOT complemented.
C
C Assumes n >= 1 and 1 <= cnt <= 63 -- TODO confirm against the callers.
C With cnt in that range the left-shifted and right-shifted parts of a limb
C pair occupy disjoint bits, so eon (xor-not) of the two parts gives the
C complement of the merged limb.
C
C Register roles:
C   x16 = rp   rp_arg + 8*n, destination walked from the high end downwards
C   x1  = up   advanced to up + 8*n, source walked from the high end downwards
C   x8  = tnc  0 - cnt; register shifts use the count modulo 64, so a right
C              shift by tnc is a right shift by 64 - cnt
C   x17        n / 4, trip count of the 4-limb unrolled loop
C   x2         scratch after the last read of n (holds a pending left-shifted
C              limb), x4-x7 loaded limbs, x10-x13 shifted parts
C
C x17 is used for the loop counter rather than x18: x18 is the AAPCS64
C platform register and is reserved on Apple and Windows targets.
C Clobbers: x1, x2, x4-x8, x10-x13, x16, x17.
C ---------------------------------------------------------------------------

ASM_START()
PROLOGUE(mpn_lshiftc)
	add	rp, rp_arg, n, lsl #3		C rp = rp_arg + 8*n
	add	up, up, n, lsl #3		C up = up + 8*n
	sub	tnc, xzr, cnt			C tnc = -cnt
	lsr	x17, n, #2			C x17 = n / 4
	tbz	n, #0, L(bx0)			C bit 0 clear: n is even

C n is odd: fetch the top limb alone.
L(bx1):	ldr	x4, [up,#-8]
	tbnz	n, #1, L(b11)

C n mod 4 = 1
L(b01):	NSHIFT	x0, x4, tnc			C return value
	PSHIFT	x2, x4, cnt
	cbnz	x17, L(gt1)
	mvn	x2, x2				C n = 1: single complemented limb
	str	x2, [rp,#-8]
	ret
L(gt1):	ldp	x4, x5, [up,#-24]
	sub	up, up, #8			C bias pointers for mid-loop entry
	add	rp, rp, #16
	b	L(lo2)

C n mod 4 = 3
L(b11):	NSHIFT	x0, x4, tnc			C return value
	PSHIFT	x2, x4, cnt
	ldp	x6, x7, [up,#-24]!
	b	L(lo3)

C n is even: fetch the top two limbs.
L(bx0):	ldp	x4, x5, [up,#-16]
	tbz	n, #1, L(b00)

C n mod 4 = 2
L(b10):	NSHIFT	x0, x5, tnc			C return value
	PSHIFT	x13, x5, cnt
	NSHIFT	x10, x4, tnc
	PSHIFT	x2, x4, cnt
	cbnz	x17, L(gt2)
	eon	x10, x10, x13			C n = 2: store both limbs and leave
	mvn	x2, x2
	stp	x2, x10, [rp,#-16]
	ret
L(gt2):	ldp	x4, x5, [up,#-32]
	eon	x10, x10, x13
	str	x10, [rp,#-8]			C top result limb
	sub	up, up, #16			C bias pointers for mid-loop entry
	add	rp, rp, #8
	b	L(lo2)

C n mod 4 = 0
L(b00):	NSHIFT	x0, x5, tnc			C return value
	PSHIFT	x13, x5, cnt
	NSHIFT	x10, x4, tnc
	PSHIFT	x2, x4, cnt
	ldp	x6, x7, [up,#-32]!
	eon	x10, x10, x13
	str	x10, [rp,#-8]!			C top result limb
	b	L(lo0)

C Main loop: four limbs per iteration, entered at lo2, lo0 or lo3 depending
C on n mod 4.
	ALIGN(16)
L(top):	ldp	x4, x5, [up,#-16]
	eon	x10, x10, x13
	eon	x11, x12, x2
	stp	x10, x11, [rp,#-16]
	PSHIFT	x2, x6, cnt
L(lo2):	NSHIFT	x10, x4, tnc
	PSHIFT	x13, x5, cnt
	NSHIFT	x12, x5, tnc
	ldp	x6, x7, [up,#-32]!
	eon	x10, x10, x13
	eon	x11, x12, x2
	stp	x10, x11, [rp,#-32]!
	PSHIFT	x2, x4, cnt
L(lo0):	sub	x17, x17, #1
L(lo3):	NSHIFT	x10, x6, tnc
	PSHIFT	x13, x7, cnt
	NSHIFT	x12, x7, tnc
	cbnz	x17, L(top)

C Wind-down: store the last merged pair, then the least significant result
C limb, which has no lower neighbour and is just the complemented left shift.
L(end):	eon	x10, x10, x13
	eon	x11, x12, x2
	PSHIFT	x2, x6, cnt
	stp	x10, x11, [rp,#-16]
	mvn	x2, x2
	str	x2, [rp,#-24]
	ret
EPILOGUE()