dnl  PowerPC-64 mpn_rshift -- rp[] = up[] >> cnt

dnl  Copyright 2003, 2005, 2010, 2011 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C                  cycles/limb
C POWER3/PPC630          ?
C POWER4/PPC970          ?
C POWER5                 2.25
C POWER6                 9.75
C POWER7                 2.15

C TODO
C  * Try to reduce the number of needed live registers
C  * Micro-optimise header code
C  * Keep in synch with lshift.asm and lshiftc.asm

C OVERVIEW
C  Shifts the n-limb operand at up right by cnt bits and stores the n result
C  limbs at rp.  Each result limb is formed as
C      rp[i] = (up[i] >> cnt) | (up[i+1] << (64-cnt))
C  and the most significant result limb is just up[n-1] >> cnt.  The value
C  returned is up[0] << (64-cnt), i.e. the bits shifted out at the low end.
C
C  The code loads up[0] unconditionally, so n is presumably required to be at
C  least 1, and cnt is presumably in the range 1..63 -- confirm against the
C  GMP manual entry for mpn_rshift.
C
C  Structure: the header dispatches on n mod 4 to one of four feed-in
C  sequences (b00, b01, b10, b11).  Each feed-in primes the software pipeline
C  and then either jumps into the 4-way unrolled loop at the matching entry
C  point (L00, L11, L10, or top itself for b01) or, when no full loop
C  iteration is needed, straight into the wind-down code (end/cj4/cj3/cj2).
C  ctr holds (n+3)/4.
C
C  r30 and r31 are saved at negative offsets from r1 without moving r1, and
C  are restored at L(ret).

C INPUT PARAMETERS
define(`rp',  `r3')
define(`up',  `r4')
define(`n',   `r5')
define(`cnt', `r6')

C WORKING REGISTERS
C  tnc    = 64 - cnt, the complementary shift count
C  u0, u1 = most recently loaded source limbs (saved/restored registers)
C  retval = return value; shares r5 with n, so it may only be written after
C           the last read of n
define(`tnc',`r0')
define(`u0',`r30')
define(`u1',`r31')
define(`retval',`r5')

ASM_START()
PROLOGUE(mpn_rshift)
	std	r31, -8(r1)		C save r31 (u1) below the stack pointer
	std	r30, -16(r1)		C save r30 (u0) below the stack pointer
	subfic	tnc, cnt, 64		C tnc = 64 - cnt
C	sldi	r30, n, 3	C byte count corresponding to n
C	add	rp, rp, r30	C rp = rp + n
C	add	up, up, r30	C up = up + n
	rldicl.	r30, n, 0,62	C r30 = n & 3, set cr0
	cmpdi	cr6, r30, 2		C cr6 = compare (n & 3) with 2
	addi	r31, n, 3	C compute count...
	ld	r10, 0(up)	C load 1st limb for b00...b11
	sld	retval, r10, tnc	C retval = up[0] << tnc; n (same reg) is dead now
C The mode32 variant below also shifts right by 2, but its mask keeps only
C the low 30 bits of the result, so upper-half bits of r31 are ignored.
ifdef(`HAVE_ABI_mode32',
`	rldicl	r31, r31, 62,34',	C ...branch count
`	srdi	r31, r31, 2')	C ...for ctr
	mtctr	r31		C copy count into ctr
	beq	cr0, L(b00)		C n mod 4 == 0
	blt	cr6, L(b01)		C n mod 4 == 1
	ld	r11, 8(up)	C load 2nd limb for b10 and b11
	beq	cr6, L(b10)		C n mod 4 == 2, else fall into b11

C Feed-in for n mod 4 == 3.  r10 = up[0], r11 = up[1].
C rp is biased by -16 so that the shared stores at 16(rp) and up hit rp[0]...
	ALIGN(16)
L(b11):	srd	r8, r10, cnt
	sld	r9, r11, tnc
	ld	u1, 16(up)
	addi	up, up, 24
	srd	r12, r11, cnt
	sld	r7, u1, tnc
	addi	rp, rp, -16
	bdnz	L(gt3)			C more than 3 limbs: go prime the loop

	or	r11, r8, r9		C n == 3: finish in the wind-down code
	srd	r8, u1, cnt
	b	L(cj3)

	ALIGN(16)
L(gt3):	ld	u0, 0(up)
	or	r11, r8, r9
	srd	r8, u1, cnt
	sld	r9, u0, tnc
	ld	u1, 8(up)
	or	r10, r12, r7
	b	L(L11)

C Feed-in for n mod 4 == 2.  r10 = up[0], r11 = up[1].
C rp is biased by -24 so that the shared stores at 24(rp) and up hit rp[0]...
	ALIGN(32)
L(b10):	srd	r12, r10, cnt
	addi	rp, rp, -24
	sld	r7, r11, tnc
	bdnz	L(gt2)			C more than 2 limbs: go prime the loop

	srd	r8, r11, cnt		C n == 2: finish in the wind-down code
	or	r10, r12, r7
	b	L(cj2)

L(gt2):	ld	u0, 16(up)
	srd	r8, r11, cnt
	sld	r9, u0, tnc
	ld	u1, 24(up)
	or	r10, r12, r7
	srd	r12, u0, cnt
	sld	r7, u1, tnc
	ld	u0, 32(up)
	or	r11, r8, r9
	addi	up, up, 16
	b	L(L10)

C Feed-in for n mod 4 == 0.  r10 = up[0].
C rp is biased by -8 so that the shared stores at 8(rp) and up hit rp[0]...
	ALIGN(16)
L(b00):	ld	u1, 8(up)
	srd	r12, r10, cnt
	sld	r7, u1, tnc
	ld	u0, 16(up)
	srd	r8, u1, cnt
	sld	r9, u0, tnc
	ld	u1, 24(up)
	or	r10, r12, r7
	srd	r12, u0, cnt
	sld	r7, u1, tnc
	addi	rp, rp, -8
	bdz	L(cj4)			C n == 4: finish in the wind-down code

L(gt4):	addi	up, up, 32
	ld	u0, 0(up)
	or	r11, r8, r9
	b	L(L00)

C Feed-in for n mod 4 == 1.  r10 = up[0].  No rp bias is applied here.
	ALIGN(16)
L(b01):	bdnz	L(gt1)			C more than 1 limb: go prime the loop
	srd	r8, r10, cnt		C n == 1: single limb, store and return
	std	r8, 0(rp)
	b	L(ret)

L(gt1):	ld	u0, 8(up)
	srd	r8, r10, cnt
	sld	r9, u0, tnc
	ld	u1, 16(up)
	srd	r12, u0, cnt
	sld	r7, u1, tnc
	ld	u0, 24(up)
	or	r11, r8, r9
	srd	r8, u1, cnt
	sld	r9, u0, tnc
	ld	u1, 32(up)
	addi	up, up, 40
	or	r10, r12, r7
	bdz	L(end)			C exactly 5 limbs: skip the loop body

C Main loop, 4 limbs per iteration.  r10 and r11 carry completed result
C limbs waiting to be stored; r8/r9 and r12/r7 carry the two halves of the
C limbs currently being assembled; u0/u1 alternate as the newest source limb.
C L00, L11 and L10 are the entry points used by the other feed-in paths.
	ALIGN(32)
L(top):	srd	r12, u0, cnt
	sld	r7, u1, tnc
	ld	u0, 0(up)
	std	r11, 0(rp)
	or	r11, r8, r9
L(L00):	srd	r8, u1, cnt
	sld	r9, u0, tnc
	ld	u1, 8(up)
	std	r10, 8(rp)
	or	r10, r12, r7
L(L11):	srd	r12, u0, cnt
	sld	r7, u1, tnc
	ld	u0, 16(up)
	std	r11, 16(rp)
	or	r11, r8, r9
L(L10):	srd	r8, u1, cnt
	sld	r9, u0, tnc
	ld	u1, 24(up)
	addi	up, up, 32
	std	r10, 24(rp)
	addi	rp, rp, 32
	or	r10, r12, r7
	bdnz	L(top)

C Wind-down: no more loads, just combine and store what is in flight.
C cj4, cj3 and cj2 are the join points for the n == 4, 3 and 2 short paths.
C The last store writes the most significant result limb, a plain right
C shift with nothing ORed in from above.
	ALIGN(32)
L(end):	srd	r12, u0, cnt
	sld	r7, u1, tnc
	std	r11, 0(rp)
L(cj4):	or	r11, r8, r9
	srd	r8, u1, cnt
	std	r10, 8(rp)
L(cj3):	or	r10, r12, r7
	std	r11, 16(rp)
L(cj2):	std	r10, 24(rp)
	std	r8, 32(rp)

C Restore the saved registers and hand back retval: in r3 for the default
C ABI, or as r3 = retval >> 32 with r4 = retval when HAVE_ABI_mode32 is set.
L(ret):	ld	r31, -8(r1)
	ld	r30, -16(r1)
ifdef(`HAVE_ABI_mode32',
`	srdi	r3, retval, 32
	mr	r4, retval
',`	mr	r3, retval')
	blr
EPILOGUE()