dnl  S/390-32 mpn_sqr_basecase.

dnl  Copyright 2011 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C            cycles/limb
C z900		 ?
C z990		23
C z9		 ?
C z10		 ?
C z196		 ?

C TODO
C  * Clean up.
C  * Stop iterating addmul_1 loop at latest for n = 2, implement longer tail.
C    This will ask for basecase handling of n = 3.
C  * Update counters and pointers more straightforwardly, possibly lowering
C    register usage.
C  * Should we use this allocation-free style for more sqr_basecase asm
C    implementations?  The only disadvantage is that it requires R != U.
C  * Replace loops by faster code.  The mul_1 and addmul_1 loops could be sped
C    up by about 10%.  The sqr_diag_addlsh1 loop could probably be sped up even
C    more.

C INPUT PARAMETERS
C   rp  destination, receives 2n limbs (32-bit words)
C   up  source operand, n limbs; must not be the same area as rp
C   n   limb count, n >= 1
define(`rp',	`%r2')
define(`up',	`%r3')
define(`n',	`%r4')

C Register aliases.  zero holds constant 0 in the n = 2 and general paths;
C the remaining aliases are only used in the general (n >= 3) path.
define(`zero',		`%r8')
define(`rp_saved',	`%r9')
define(`up_saved',	`%r13')
define(`n_saved',	`%r14')
define(`inner_cnt',	`%r5')
define(`limb_idx',	`%r12')

C mpn_sqr_basecase -- square the n-limb number at up, store 2n limbs at rp.
C
C Three code paths, selected on n:
C   n = 1   one 32x32->64 multiply
C   n = 2   three multiplies, cross product doubled inline
C   n >= 3  mul_1 for row 0, addmul_1 for the following rows, one final
C           cross product, then a pass that doubles the off-diagonal sum
C           and adds in the squares u[i]^2

ASM_START()
PROLOGUE(mpn_sqr_basecase)
	ahi	n, -2			C n -= 2; cc reflects sign of n - 2
	jhe	L(n_ge2)

C n = 1 ----------------------------------------------------------------------
	l	%r5, 0(up)
	mlr	%r4, %r5		C r4:r5 = u0 * u0
	st	%r5, 0(rp)
	st	%r4, 4(rp)
	br	%r14

L(n_ge2):
	jne	L(general)		C cc is still the one set by ahi

C n = 2 ----------------------------------------------------------------------
	stm	%r6, %r8, 24(%r15)	C r6-r8 are used below, save them
	lhi	zero, 0

	l	%r5, 0(up)
	mlr	%r4, %r5		C r4:r5 = u0 * u0
	l	%r1, 4(up)
	mlr	%r0, %r1		C r0:r1 = u1 * u1
	st	%r5, 0(rp)

	l	%r7, 0(up)
	ml	%r6, 4(up)		C r6:r7 = u0 * u1
	alr	%r7, %r7		C r6:r7 *= 2 ...
	alcr	%r6, %r6
	alcr	%r0, zero		C ... bit shifted out goes to top limb

	alr	%r4, %r7		C add doubled cross product at limb 1
	alcr	%r1, %r6
	alcr	%r0, zero
	st	%r4, 4(rp)
	st	%r1, 8(rp)
	st	%r0, 12(rp)

	lm	%r6, %r8, 24(%r15)
	br	%r14

L(general):
C mul_1 =======================================================================
C Row 0: store u0 * {u1, ..., u[n-1]} starting at 4(rp).

	stm	%r6, %r14, 24(%r15)
	lr	n_saved, n		C n_saved = n - 2 (entry n)
	lr	rp_saved, rp
	lr	up_saved, up
	lhi	zero, 0

	l	%r6, 0(up)		C multiplier limb
	l	%r11, 4(up)
	lhi	limb_idx, 8		C byte offset of next limb
	mlr	%r10, %r6		C r10:r11 = u0 * u1
	lr	inner_cnt, n
	st	%r11, 4(rp)
	cr	%r15, %r15		C equal compare gives cc 0, carry clear

L(mul1_top):
	l	%r1, 0(limb_idx,up)
	mlr	%r0, %r6		C r0:r1 = multiplier * u[i]
	alcr	%r1, %r10		C add carry limb and carry bit
	lr	%r10, %r0		C high part is the next carry limb
	st	%r1, 0(limb_idx,rp)
	la	limb_idx, 4(limb_idx)
	brct	inner_cnt, L(mul1_top)

	alcr	%r0, zero		C fold last carry bit into top limb
	st	%r0, 0(limb_idx,rp)

C addmul_1 loop ===============================================================
C Each outer pass moves up by one limb and rp by two limbs, then adds
C (limb at 0(up)) * (limbs from 4(up) on) into rp starting at 4(rp).

	ahi	n, -1
	je	L(outer_done)
L(outer_top):

	la	rp, 8(rp)		C rp += 2 limbs
	la	up, 4(up)		C up += 1 limb
	l	%r6, 0(up)		C multiplier limb for this row
	l	%r11, 4(up)
	lhi	limb_idx, 8		C byte offset of next limb
	mlr	%r10, %r6		C r10:r11 = first product of the row
	lr	inner_cnt, n
	al	%r11, 4(rp)		C carry out is consumed by alcr below
	st	%r11, 4(rp)

L(addmul1_top):
	l	%r1, 0(limb_idx,up)
	l	%r7, 0(limb_idx,rp)
	mlr	%r0, %r6		C r0:r1 = multiplier * u[i]
	alcr	%r1, %r7		C add existing rp limb plus carry bit
	alcr	%r0, zero
	alr	%r1, %r10		C add carry limb from previous product
	lr	%r10, %r0
	st	%r1, 0(limb_idx,rp)
	la	limb_idx, 4(limb_idx)
	brct	inner_cnt, L(addmul1_top)

	alcr	%r0, zero
	st	%r0, 0(limb_idx,rp)

	brct	n, L(outer_top)
L(outer_done):

C Last cross product, limb at 4(up) times limb at 8(up), added at 12(rp).
	l	%r6, 4(up)
	l	%r1, 8(up)
	lr	%r7, %r0		C Same as: l %r7, 12(,rp)
	mlr	%r0, %r6
	alr	%r1, %r7
	alcr	%r0, zero
	st	%r1, 12(rp)
	st	%r0, 16(rp)

C sqr_diag_addlsh1 ============================================================
C Restart from the saved base pointers.  For each remaining limb u[i] the
C two result limbs at 4(rp) and 8(rp) are doubled, then the high limb of the
C previous square and the low limb of u[i]^2 are added in.

define(`up', `up_saved')
define(`rp', `rp_saved')
	la	n, 1(n_saved)		C iterations = n_saved + 1

	l	%r1, 0(up)
	mlr	%r0, %r1		C r0:r1 = u0 * u0
	st	%r1, 0(rp)
C	clr	%r15, %r15		C not needed, carry already clear here

L(diag_top):
	l	%r11, 4(up)
	la	up, 4(up)
	l	%r6, 4(rp)
	l	%r7, 8(rp)
	mlr	%r10, %r11		C r10:r11 = u[i] * u[i]
	alcr	%r6, %r6		C double, taking carry from last pass
	alcr	%r7, %r7
	alcr	%r10, zero		C carry goes into high product limb
	alr	%r6, %r0		C add high limb of previous square
	alcr	%r7, %r11		C add low limb of this square
	stm	%r6, %r7, 4(rp)
	la	rp, 8(rp)
	lr	%r0, %r10		C high limb carries to the next pass
	brct	n, L(diag_top)

	alcr	%r0, zero
	st	%r0, 4(rp)

	lm	%r6, %r14, 24(%r15)
	br	%r14
EPILOGUE()