1dnl S/390-64 mpn_sqr_basecase. 2 3dnl Copyright 2011 Free Software Foundation, Inc. 4 5dnl This file is part of the GNU MP Library. 6dnl 7dnl The GNU MP Library is free software; you can redistribute it and/or modify 8dnl it under the terms of either: 9dnl 10dnl * the GNU Lesser General Public License as published by the Free 11dnl Software Foundation; either version 3 of the License, or (at your 12dnl option) any later version. 13dnl 14dnl or 15dnl 16dnl * the GNU General Public License as published by the Free Software 17dnl Foundation; either version 2 of the License, or (at your option) any 18dnl later version. 19dnl 20dnl or both in parallel, as here. 21dnl 22dnl The GNU MP Library is distributed in the hope that it will be useful, but 23dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 24dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 25dnl for more details. 26dnl 27dnl You should have received copies of the GNU General Public License and the 28dnl GNU Lesser General Public License along with the GNU MP Library. If not, 29dnl see https://www.gnu.org/licenses/. 30 31include(`../config.m4') 32 33C cycles/limb 34C z900 ? 35C z990 23 36C z9 ? 37C z10 28 38C z196 ? 39 40C TODO 41C * Clean up. 42C * Stop iterating addmul_1 loop at latest for n = 2, implement longer tail. 43C This will ask for basecase handling of n = 3. 44C * Update counters and pointers more straightforwardly, possibly lowering 45C register usage. 46C * Should we use this allocation-free style for more sqr_basecase asm 47C implementations? The only disadvantage is that it requires R != U. 48C * Replace loops by faster code. The mul_1 and addmul_1 loops could be sped 49C up by about 10%. The sqr_diag_addlsh1 loop could probably be sped up even 50C more. 
C INPUT PARAMETERS
define(`rp',	`%r2')
define(`up',	`%r3')
define(`n',	`%r4')

C Registers with a fixed role in the general (n >= 3) path.
define(`zero',	`%r8')
define(`rp_saved',	`%r9')
define(`up_saved',	`%r13')
define(`n_saved',	`%r14')

C Aliases used only by the mul_1 / addmul_1 phases of the general path.
C The n = 1 and n = 2 paths and the diagonal phase reuse the same physical
C registers for other values and therefore spell them out as raw %rN.
C   v0   multiplier limb of the current row
C   cnt  inner loop trip counter
C   idx  byte offset applied to both up and rp inside the inner loops
define(`v0',	`%r6')
define(`cnt',	`%r5')
define(`idx',	`%r12')

C Aliases for the diagonal phase.  They name the same registers as up_saved
C and rp_saved; that phase walks them forward from the original operand and
C result addresses.
define(`diag_up',	`up_saved')
define(`diag_rp',	`rp_saved')

C mpn_sqr_basecase (rp, up, n)
C
C Stores the 2n-limb square of the n-limb operand at up into rp, using 64-bit
C limbs.  The file header notes that this allocation-free scheme requires
C rp != up.  n is presumably >= 1; a zero n would fall into the n = 1 code.
C
C Structure:
C   n = 1   one multiply, two stores.
C   n = 2   three multiplies, cross product doubled in registers.
C   n >= 3  (a) mul_1:     rp[1..n]   = u[0] * u[1..n-1]
C           (b) addmul_1:  add u[k] * u[k+1..n-1] at rp[2k+1..] for the
C                          remaining rows; the last row is a one-limb tail
C           (c) sqr_diag_addlsh1: rp = 2 * (cross products) + sum of u[i]^2
C
C Every mlgr/mlg below works on an even/odd register pair: the odd register
C supplies one factor and the pair receives the product, high limb in the even
C register and low limb in the odd one.
C
C %r6 and up are stored to 48(%r15) before being modified and reloaded before
C returning (presumably the register save area the caller provides; confirm
C against the ABI).  The n = 1 path touches only %r4 and %r5.

ASM_START()
PROLOGUE(mpn_sqr_basecase)
	aghi	n, -2			C n -= 2, condition code reused below
	jhe	L(n_ge2)

C n = 1
	lg	%r5, 0(up)
	mlgr	%r4, %r5		C %r4:%r5 = u0 * u0
	stg	%r5, 0(rp)
	stg	%r4, 8(rp)
	br	%r14

L(n_ge2):
	jne	L(general)		C still testing the aghi result

C n = 2
	stmg	%r6, %r8, 48(%r15)
	lghi	zero, 0

	lg	%r5, 0(up)
	mlgr	%r4, %r5		C %r4:%r5 = u0 * u0
	lg	%r1, 8(up)
	mlgr	%r0, %r1		C %r0:%r1 = u1 * u1
	stg	%r5, 0(rp)

	lg	%r7, 0(up)
	mlg	%r6, 8(up)		C %r6:%r7 = u0 * u1
	algr	%r7, %r7		C double the cross product ...
	alcgr	%r6, %r6
	alcgr	%r0, zero		C ... its top bit goes into the top limb

	algr	%r4, %r7		C add doubled cross product at limbs 1, 2
	alcgr	%r1, %r6
	alcgr	%r0, zero
	stg	%r4, 8(rp)
	stg	%r1, 16(rp)
	stg	%r0, 24(rp)

	lmg	%r6, %r8, 48(%r15)
	br	%r14

L(general):
C mul_1 =======================================================================
C First row of cross products, u[0] * u[1..].  n holds the limb count minus 2
C here, which is exactly the number of inner iterations needed after the first
C product has been peeled off.

	stmg	%r6, %r14, 48(%r15)
	lghi	zero, 0
	lgr	up_saved, up
	lgr	rp_saved, rp
	lgr	n_saved, n

	lg	v0, 0(up)
	lg	%r11, 8(up)
	lghi	idx, 16			C first loop access is limb 2
	mlgr	%r10, v0		C %r10:%r11 = u1 * u0
	lgr	cnt, n
	stg	%r11, 8(rp)
	cr	%r15, %r15		C clear carry flag

L(mul1_top):
	lg	%r1, 0(idx,up)
	mlgr	%r0, v0
	alcgr	%r1, %r10		C low limb + previous high limb + carry
	lgr	%r10, %r0		C high limb becomes the next carry limb
	stg	%r1, 0(idx,rp)
	la	idx, 8(idx)		C la leaves the carry untouched
	brctg	cnt, L(mul1_top)

	alcgr	%r0, zero		C fold the last carry into the top limb
	stg	%r0, 0(idx,rp)

C addmul_1 loop ===============================================================
C Remaining rows.  Each pass moves up by one limb and rp by two, so that
C 8(rp) is where the first product of the row belongs.  n counts the rows
C still to do and is also the inner trip count, which shrinks by one per row.

	aghi	n, -1
	je	L(am_tail)		C three limbs: only the tail row remains
L(am_outer):

	la	rp, 16(rp)		C rp += 2
	la	up, 8(up)		C up += 1
	lg	v0, 0(up)
	lg	%r11, 8(up)
	lghi	idx, 16
	mlgr	%r10, v0
	lgr	cnt, n
	alg	%r11, 8(rp)		C carry out feeds the first alcgr below
	stg	%r11, 8(rp)

L(am_inner):
	lg	%r1, 0(idx,up)
	lg	%r7, 0(idx,rp)
	mlgr	%r0, v0
	alcgr	%r1, %r7		C low limb + rp limb + incoming carry
	alcgr	%r0, zero		C absorb that carry in the high limb
	algr	%r1, %r10		C add previous high limb; carry goes to
	lgr	%r10, %r0		C   the next iteration
	stg	%r1, 0(idx,rp)
	la	idx, 8(idx)
	brctg	cnt, L(am_inner)

	alcgr	%r0, zero
	stg	%r0, 0(idx,rp)

	brctg	n, L(am_outer)
L(am_tail):
C Last row: a single product of the two most significant limbs.  %r0 still
C holds the limb just stored by the code above, i.e. the value at 24(rp).

	lg	v0, 8(up)
	lg	%r1, 16(up)
	lgr	%r7, %r0		C Same as: lg %r7, 24(,rp)
	mlgr	%r0, v0
	algr	%r1, %r7
	alcgr	%r0, zero
	stg	%r1, 24(rp)
	stg	%r0, 32(rp)

C sqr_diag_addlsh1 ============================================================
C Double the cross products held in rp[1..] and add the squares u[i]^2, two
C result limbs per iteration.  %r0 carries the pending high limb of the
C previous square; the carry flag carries the overflow of the previous
C addition into the next doubling.
C
C No explicit carry clear is issued before the loop (the original has a
C commented-out clr %r15, %r15 marked as already clear per above): nothing
C between the final alcgr of the tail and the loop changes the flags.

	la	n, 1(n_saved)		C iterations = limb count - 1

	lg	%r1, 0(diag_up)
	mlgr	%r0, %r1		C %r0:%r1 = u0 * u0
	stg	%r1, 0(diag_rp)

L(diag_top):
	lg	%r11, 8(diag_up)
	la	diag_up, 8(diag_up)
	lg	%r6, 8(diag_rp)
	lg	%r7, 16(diag_rp)
	mlgr	%r10, %r11		C %r10:%r11 = u[i] * u[i]
	alcgr	%r6, %r6		C double two limbs of cross products
	alcgr	%r7, %r7
	alcgr	%r10, zero		C propagate carry to high product limb
	algr	%r6, %r0		C add high limb of the previous square
	alcgr	%r7, %r11		C add low limb of this square
	stmg	%r6, %r7, 8(diag_rp)
	la	diag_rp, 16(diag_rp)
	lgr	%r0, %r10		C copy carry limb
	brctg	n, L(diag_top)

	alcgr	%r0, zero
	stg	%r0, 8(diag_rp)		C most significant result limb

	lmg	%r6, %r14, 48(%r15)
	br	%r14
EPILOGUE()