dnl  S/390-64 mpn_sqr_basecase.

dnl  Copyright 2011 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C            cycles/limb
C z900           ?
C z990          23
C z9             ?
C z10            ?
C z196           ?

C TODO
C  * Clean up.
C  * Stop iterating addmul_1 loop at latest for n = 2, implement longer tail.
C    This will ask for basecase handling of n = 3.
C  * Update counters and pointers more straightforwardly, possibly lowering
C    register usage.
C  * Should we use this allocation-free style for more sqr_basecase asm
C    implementations?  The only disadvantage is that it requires R != U.
C  * Replace loops by faster code.  The mul_1 and addmul_1 loops could be sped
C    up by about 10%.  The sqr_diag_addlsh1 loop could probably be sped up even
C    more.

C INPUT PARAMETERS
C   rp  destination pointer; limbs rp[0] .. rp[2N-1] are written
C   up  source pointer; limbs up[0] .. up[N-1] are read
C   n   limb count, called N in the comments below
define(`rp',    `%r2')
define(`up',    `%r3')
define(`n',     `%r4')

C Register roles on the general path (N >= 3):
C   zero      constant 0, used with alcgr to fold a pending carry into a limb
C   rp_saved  copy of the entry value of rp, reused by the last pass
C   up_saved  copy of the entry value of up, reused by the last pass
C   n_saved   copy of N-2.  This is %r14, which also carries the return
C             address (see the br %r14 exits); it is spilled by the stmg at
C             L(gen) before being overwritten and reloaded by the final lmg.
define(`zero',  `%r8')
define(`rp_saved',      `%r9')
define(`up_saved',      `%r13')
define(`n_saved',       `%r14')

C mpn_sqr_basecase (rp, up, n)
C
C Store the 2N-limb square of the N-limb operand at up into rp.
C
C Three code paths, selected from the result of n - 2:
C   N = 1   one 64x64 multiply.
C   N = 2   three multiplies, combined with straight-line carry handling.
C   N >= 3  three passes over the operand:
C             mul_1             rp[1..N] = up[0] * up[1..N-1]
C             addmul_1          add up[i] * up[i+1..N-1] for i = 1 .. N-2
C             sqr_diag_addlsh1  double that sum and add the squares up[i]^2
C
C Any count below 2 takes the N = 1 path, so N = 0 is not distinguished.
C Result limbs are stored while source limbs are still being read (the N = 2
C path stores rp[0] before it reloads up[0]), so rp must not alias up.
C
C The code depends on the carry surviving every instruction placed between an
C add (algr, alg, alcgr) and the alcgr that consumes it, including across the
C loop branches.  Keep the instruction order as is.
C
C NOTE(review): 48(%r15) is presumably the register save area the s390x ELF
C ABI has the caller provide; no stack frame is allocated here.  Confirm
C against the ABI before changing the spill layout.

ASM_START()
PROLOGUE(mpn_sqr_basecase)
        aghi    n, -2                   C n = N-2, condition code reflects sign
        jhe     L(ge2)                  C N >= 2

C n = 1
C Only %r4 and %r5 are touched, so nothing is saved.
        lg      %r5, 0(up)
        mlgr    %r4, %r5                C %r4:%r5 = u0 * u0
        stg     %r5, 0(rp)              C low limb
        stg     %r4, 8(rp)              C high limb
        br      %r14

L(ge2): jne     L(gen)                  C still testing the aghi result: N > 2

C n = 2
C Result = u0^2 + 2*u0*u1*B + u1^2*B^2 with B = 2^64.
        stmg    %r6, %r8, 48(%r15)      C spill the three registers used here
        lghi    zero, 0

        lg      %r5, 0(up)
        mlgr    %r4, %r5                C u0 * u0
        lg      %r1, 8(up)
        mlgr    %r0, %r1                C u1 * u1
        stg     %r5, 0(rp)              C rp[0] = low limb of u0^2

        lg      %r7, 0(up)
        mlg     %r6, 8(up)              C u0 * u1
        algr    %r7, %r7                C double the cross product ...
        alcgr   %r6, %r6
        alcgr   %r0, zero               C ... its top bit goes into high(u1^2)

        algr    %r4, %r7                C high(u0^2) + low(2*u0*u1)
        alcgr   %r1, %r6                C low(u1^2) + high(2*u0*u1) + carry
        alcgr   %r0, zero               C final carry into the top limb
        stg     %r4, 8(rp)
        stg     %r1, 16(rp)
        stg     %r0, 24(rp)

        lmg     %r6, %r8, 48(%r15)
        br      %r14

L(gen):
C mul_1 =======================================================================
C rp[1..N] = up[0] * up[1..N-1].  The register n holds N-2 from here on.
C   %r6      multiplier limb up[0]
C   %r12     byte offset of the limb being processed
C   %r5      loop counter, N-2 iterations covering up[2..N-1]
C   %r10     high limb of the previous product (the carry limb)
C   %r0:%r1  current product, high:low

        stmg    %r6, %r14, 48(%r15)     C spill %r6-%r14, includes return addr
        lghi    zero, 0
        lgr     up_saved, up
        lgr     rp_saved, rp
        lgr     n_saved, n              C n_saved = N-2

        lg      %r6, 0(up)
        lg      %r11, 8(up)
        lghi    %r12, 16                C init index register
        mlgr    %r10, %r6               C %r10:%r11 = up[1] * up[0]
        lgr     %r5, n
        stg     %r11, 8(rp)             C rp[1] = low limb of that product
        cr      %r15, %r15              C clear carry flag

L(tm):  lg      %r1, 0(%r12,up)
        mlgr    %r0, %r6
        alcgr   %r1, %r10               C low limb + previous high limb + carry
        lgr     %r10, %r0               C copy high part to carry limb
        stg     %r1, 0(%r12,rp)
        la      %r12, 8(%r12)           C next limb; carry must stay live
        brctg   %r5, L(tm)

        alcgr   %r0, zero               C fold the last carry into the top limb
        stg     %r0, 0(%r12,rp)         C rp[N]; %r0 stays live, see L(outer_end)

C addmul_1 loop ===============================================================
C Each outer pass moves up one limb forward and rp two limbs forward, then
C adds up[0] * up[1..] into rp[1..] and stores the resulting carry limb just
C past the last limb it added into.  n counts the passes still to run: N-3 on
C entry, and it is also the inner trip count of the current pass.  When N = 3
C no pass runs at all.

        aghi    n, -1
        je      L(outer_end)
L(outer_loop):

        la      rp, 16(rp)              C rp += 2
        la      up, 8(up)               C up += 1
        lg      %r6, 0(up)              C multiplier limb for this pass
        lg      %r11, 8(up)
        lghi    %r12, 16                C init index register
        mlgr    %r10, %r6               C %r10:%r11 = up[1] * up[0]
        lgr     %r5, n
        alg     %r11, 8(rp)             C add into rp[1]; carry feeds L(tam)
        stg     %r11, 8(rp)

L(tam): lg      %r1, 0(%r12,up)
        lg      %r7, 0(%r12,rp)
        mlgr    %r0, %r6
        alcgr   %r1, %r7                C low limb + rp limb + incoming carry
        alcgr   %r0, zero               C that carry goes into the high limb
        algr    %r1, %r10               C add previous high limb; carry is
                                        C consumed by the next alcgr executed
        lgr     %r10, %r0
        stg     %r1, 0(%r12,rp)
        la      %r12, 8(%r12)
        brctg   %r5, L(tam)

        alcgr   %r0, zero               C fold the last carry into the top limb
        stg     %r0, 0(%r12,rp)         C store, not add: limb not written before

        brctg   n, L(outer_loop)
L(outer_end):

C Last cross product: the two highest source limbs, which sit at 8(up) and
C 16(up) relative to the current up.  It is added in at 24(rp) and its high
C limb is stored at 32(rp).  %r0 still holds the limb most recently stored at
C 24(rp), by the mul_1 tail when N = 3 and by the addmul_1 tail otherwise, so
C it is copied instead of being reloaded.
        lg      %r6, 8(up)
        lg      %r1, 16(up)
        lgr     %r7, %r0                C Same as: lg %r7, 24(,rp)
        mlgr    %r0, %r6
        algr    %r1, %r7
        alcgr   %r0, zero               C last carry-setting insn before L(top)
        stg     %r1, 24(rp)
        stg     %r0, 32(rp)

C sqr_diag_addlsh1 ============================================================
C rp now holds the sum of all cross products up[i] * up[j], i < j, in limbs
C 1 .. 2N-2.  This pass doubles that sum and adds the squares up[i]^2, working
C from the saved entry pointers.  Each iteration produces two result limbs.
C   %r0       high limb of the previous square (the carry limb)
C   %r10:%r11 current square, high:low
C   %r6, %r7  the two rp limbs being doubled

define(`up', `up_saved')
define(`rp', `rp_saved')
        la      n, 1(n_saved)           C N-1 iterations

        lg      %r1, 0(up)
        mlgr    %r0, %r1                C %r0:%r1 = up[0]^2
        stg     %r1, 0(rp)              C rp[0] is final
C       clr     %r15, %r15              C clear carry (already clear per above)
C The alcgr at L(outer_end) adds a carry to the high limb of a 64x64 product,
C which cannot overflow, and nothing executed since then sets the carry.

L(top): lg      %r11, 8(up)
        la      up, 8(up)
        lg      %r6, 8(rp)
        lg      %r7, 16(rp)
        mlgr    %r10, %r11              C %r10:%r11 = square of the next limb
        alcgr   %r6, %r6                C double, taking in the pending carry
        alcgr   %r7, %r7
        alcgr   %r10, zero              C propagate carry to high product limb
        algr    %r6, %r0                C add high limb of the previous square
        alcgr   %r7, %r11               C add low limb of this square; the carry
                                        C out is consumed at the top of the loop
        stmg    %r6, %r7, 8(rp)
        la      rp, 16(rp)
        lgr     %r0, %r10               C copy carry limb
        brctg   n, L(top)

        alcgr   %r0, zero               C fold the last carry into the top limb
        stg     %r0, 8(rp)              C most significant result limb

        lmg     %r6, %r14, 48(%r15)     C reload %r6-%r14, includes return addr
        br      %r14
EPILOGUE()