dnl  ARM64 mpn_rsh1add_n and mpn_rsh1sub_n.

dnl  Contributed to the GNU project by Torbjörn Granlund.

dnl  Copyright 2017 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb   assumed optimal c/l
C Cortex-A53	3.25-3.75	 3.0 steady
C Cortex-A57	 2.15		 1.75
C X-Gene	 2.75		 2.5

changecom(blah)

C INPUT PARAMETERS
C  rp (x0)  destination limbs
C  up (x1)  first source operand
C  vp (x2)  second source operand
C  n  (x3)  limb count; must be at least 1, the n = 0 case is not handled
C
C Forms {up,n} + {vp,n} (rsh1add) or {up,n} - {vp,n} (rsh1sub), shifts the
C result right by one bit with the final carry/borrow entering as the top
C bit, and stores n limbs at rp.  The bit shifted out, i.e. bit 0 of the
C lowest sum/difference limb, is returned in x0.
C
C REGISTER USAGE
C  x4,x5 / x8,x9  limbs loaded from up / vp
C  x12-x15        rotating window of sum/difference limbs
C  x16,x17        shifted limbs about to be stored
C  x10            return value
C  x6             loop counter, n / 4 rounded down
C
C The counter lives in x6 rather than x18.  AAPCS64 designates x18 as the
C platform register; it is reserved on Darwin and Windows and may be
C overwritten behind our back there.  x6 is caller-saved and otherwise unused
C in this routine.
C
C Only the ADDSUB/ADDSUBC instructions write the flags.  The loop control
C uses sub, cbz, cbnz and tbz, none of which touch the flags, so the
C carry/borrow chain survives across iterations.

define(`rp', `x0')
define(`up', `x1')
define(`vp', `x2')
define(`n',  `x3')

ifdef(`OPERATION_rsh1add_n', `
  define(`ADDSUB',	adds)
  define(`ADDSUBC',	adcs)
  define(`COND',	`cs')
  define(`func_n',	mpn_rsh1add_n)')
ifdef(`OPERATION_rsh1sub_n', `
  define(`ADDSUB',	subs)
  define(`ADDSUBC',	sbcs)
  define(`COND',	`cc')
  define(`func_n',	mpn_rsh1sub_n)')

MULFUNC_PROLOGUE(mpn_rsh1add_n mpn_rsh1sub_n)

ASM_START()
PROLOGUE(func_n)
	lsr	x6, n, #2		C x6 = n / 4, number of 4-limb groups

	tbz	n, #0, L(bx0)		C dispatch on n mod 4

L(bx1):	ldr	x5, [up],#8
	ldr	x9, [vp],#8
	tbnz	n, #1, L(b11)

L(b01):	ADDSUB	x13, x5, x9
	and	x10, x13, #1		C save the bit that gets shifted out
	cbz	x6, L(1)		C n = 1
	ldp	x4, x5, [up],#48
	ldp	x8, x9, [vp],#48
	ADDSUBC	x14, x4, x8
	ADDSUBC	x15, x5, x9
	ldp	x4, x5, [up,#-32]
	ldp	x8, x9, [vp,#-32]
	extr	x17, x14, x13, #1
	ADDSUBC	x12, x4, x8
	ADDSUBC	x13, x5, x9
	str	x17, [rp], #24
	sub	x6, x6, #1
	cbz	x6, L(end)
	b	L(top)

L(1):	cset	x14, COND		C final carry/borrow becomes the top bit
	extr	x17, x14, x13, #1
	str	x17, [rp]
	mov	x0, x10
	ret

L(b11):	ADDSUB	x15, x5, x9
	and	x10, x15, #1		C save the bit that gets shifted out

	ldp	x4, x5, [up],#32
	ldp	x8, x9, [vp],#32
	ADDSUBC	x12, x4, x8
	ADDSUBC	x13, x5, x9
	cbz	x6, L(3)		C n = 3
	ldp	x4, x5, [up,#-16]
	ldp	x8, x9, [vp,#-16]
	extr	x17, x12, x15, #1
	ADDSUBC	x14, x4, x8
	ADDSUBC	x15, x5, x9
	str	x17, [rp], #8
	b	L(mid)

L(3):	extr	x17, x12, x15, #1
	str	x17, [rp], #8
	b	L(2)

L(bx0):	tbz	n, #1, L(b00)

L(b10):	ldp	x4, x5, [up],#32
	ldp	x8, x9, [vp],#32
	ADDSUB	x12, x4, x8
	ADDSUBC	x13, x5, x9
	and	x10, x12, #1		C save the bit that gets shifted out
	cbz	x6, L(2)		C n = 2
	ldp	x4, x5, [up,#-16]
	ldp	x8, x9, [vp,#-16]
	ADDSUBC	x14, x4, x8
	ADDSUBC	x15, x5, x9
	b	L(mid)

L(b00):	ldp	x4, x5, [up],#48
	ldp	x8, x9, [vp],#48
	ADDSUB	x14, x4, x8
	ADDSUBC	x15, x5, x9
	and	x10, x14, #1		C save the bit that gets shifted out
	ldp	x4, x5, [up,#-32]
	ldp	x8, x9, [vp,#-32]
	ADDSUBC	x12, x4, x8
	ADDSUBC	x13, x5, x9
	add	rp, rp, #16
	sub	x6, x6, #1
	cbz	x6, L(end)

C Main loop, four limbs per iteration.  Each extr combines two adjacent
C sum/difference limbs into one limb shifted right by one bit.
	ALIGN(16)
L(top):	ldp	x4, x5, [up,#-16]
	ldp	x8, x9, [vp,#-16]
	extr	x16, x15, x14, #1
	extr	x17, x12, x15, #1
	ADDSUBC	x14, x4, x8
	ADDSUBC	x15, x5, x9
	stp	x16, x17, [rp,#-16]
L(mid):	ldp	x4, x5, [up],#32
	ldp	x8, x9, [vp],#32
	extr	x16, x13, x12, #1
	extr	x17, x14, x13, #1
	ADDSUBC	x12, x4, x8
	ADDSUBC	x13, x5, x9
	stp	x16, x17, [rp],#32
	sub	x6, x6, #1
	cbnz	x6, L(top)

L(end):	extr	x16, x15, x14, #1
	extr	x17, x12, x15, #1
	stp	x16, x17, [rp,#-16]
L(2):	cset	x14, COND		C final carry/borrow becomes the top bit
	extr	x16, x13, x12, #1
	extr	x17, x14, x13, #1
	stp	x16, x17, [rp]

L(ret):	mov	x0, x10			C return the shifted-out bit
	ret
EPILOGUE()