dnl  SPARC v9 mpn_mul_4 and mpn_addmul_4 for T3/T4/T5.

dnl  Contributed to the GNU project by Torbjörn Granlund.

dnl  Copyright 2013 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C		    cycles/limb      cycles/limb
C		       mul_4           addmul_4
C UltraSPARC T3:	21.5		22.0
C UltraSPARC T4:	 2.625		 2.75


C The code is well-scheduled and relies on OoO very little.  There is hope that
C this will run at around 2.5 and 2.75 c/l respectively, on T4.

C Incoming arguments, as seen after the save in the prologue.
define(`rp', `%i0')
define(`up', `%i1')
define(`n', `%i2')
define(`vp', `%i3')

C The four limbs loaded from vp.  v3 shares %i3 with vp; the load of v3 is
C the last use of vp.
define(`v0', `%g1')
define(`v1', `%o7')
define(`v2', `%g2')
define(`v3', `%i3')

C Running sums.  w4 only ever receives a carry-out (addxc of %g0 and %g0).
define(`w0', `%o0')
define(`w1', `%o1')
define(`w2', `%o2')
define(`w3', `%o3')
define(`w4', `%o4')

C Limb loaded from the rp area; referenced only inside AM4 lines.
define(`r0', `%o5')

C The two up limbs in flight; the two loop halves alternate between them.
define(`u0', `%i4')
define(`u1', `%i5')

C Biased base pointers.  rp0 and up0 alias rp and up; the prologue sets
C rp1 = rp0 + 8, rp2 = rp0 + 16 and up1 = up0 + 8.
define(`rp0', `rp')
define(`rp1', `%g3')
define(`rp2', `%g4')
define(`up0', `up')
define(`up1', `%g5')

C Variant selection.
C   AM4(x)  expands to x for addmul_4 and to nothing for mul_4; it wraps the
C           loads of r0 and the addcc that folds r0 into w0.
C   ADDX    is the add that directly follows that addcc.  For addmul_4 it is
C           addxccc so that the carry from the addcc is absorbed; for mul_4
C           no such addcc was emitted and a plain addcc is used.
C   func    is the entry point name given to PROLOGUE.
ifdef(`OPERATION_mul_4',`
	define(`AM4', `')
	define(`ADDX', `addcc`'$1')
	define(`func', `mpn_mul_4')
')
ifdef(`OPERATION_addmul_4',`
	define(`AM4', `$1')
	define(`ADDX', `addxccc($1,$2,$3)')
	define(`func', `mpn_addmul_4')
')


MULFUNC_PROLOGUE(mpn_mul_4 mpn_addmul_4)

ASM_START()
	REGISTER(%g2,#scratch)
	REGISTER(%g3,#scratch)
PROLOGUE(func)
	save	%sp, -176, %sp

C Setup.  n is scaled to bytes and biased by -28, added to up and to rp - 16,
C and then negated in the delay slot of the be below.  Every later memory
C access is base + n, so the bias cancels in the address; n itself is the
C loop counter tested by brlz at the bottom of the loop.  The condition codes
C set by the andcc are still intact at the be, since nothing in between
C writes them.
	ldx	[up + 0], u1		C load up[0] early
	andcc	n, 1, %g0		C is n odd?
	ldx	[vp + 0], v0
	sllx	n, 3, n
	ldx	[vp + 8], v1
	add	n, -28, n
	ldx	[vp + 16], v2
	add	rp, -16, rp
	ldx	[vp + 24], v3		C overwrites vp, see v3 above
	add	up, n, up0
	add	rp, n, rp0
	add	up0, 8, up1
	add	rp0, 8, rp1
	add	rp0, 16, rp2
	mulx	u1, v0, %l0		C low products of up[0]
	mov	0, w0
	mulx	u1, v1, %l1
	mov	0, w1
	mulx	u1, v2, %l2
	mov	0, w2
	mulx	u1, v3, %l3
	mov	0, w3

	be	L(evn)
	 neg	n, n			C delay slot, executed on both paths

C Odd n: up[0] moves to u0, up[1] is fetched into u1, and the loop is entered
C at its second half.
L(odd):	mov	u1, u0
	ldx	[up1 + n], u1		C up[1]
AM4(`	ldx	[rp2 + n], r0')
	umulxhi(u0, v0, %l4)
	umulxhi(u0, v1, %l5)
	umulxhi(u0, v2, %l6)
	umulxhi(u0, v3, %l7)
	b	L(mid)
	 add	n, 8, n			C delay slot

C Even n: up[0] stays in u1, up[1] is fetched into u0, and execution falls
C into the top of the loop.
L(evn):	ldx	[up1 + n], u0		C up[1]
AM4(`	ldx	[rp2 + n], r0')
	umulxhi(u1, v0, %l4)
	umulxhi(u1, v1, %l5)
	umulxhi(u1, v2, %l6)
	umulxhi(u1, v3, %l7)
	add	n, 16, n

C Main loop, two up limbs per iteration.  Each half adds the pending low
C products %l0-%l3 into w0-w3 while issuing the low products of the next limb,
C saves the carry-out in w4, stores w0 as a finished result limb, and then
C adds the pending high products %l4-%l7 while moving the sums down one
C position (w1 to w0, w2 to w1, w3 to w2, w4 to w3) and issuing the high
C products of the next limb.
	ALIGN(16)
L(top):	addcc	%l0, w0, w0
	mulx	u0, v0, %l0	C w 0
	addxccc(%l1, w1, w1)
	mulx	u0, v1, %l1	C w 1
	addxccc(%l2, w2, w2)
	mulx	u0, v2, %l2	C w 2
	addxccc(%l3, w3, w3)
	mulx	u0, v3, %l3	C w 3
	ldx	[up0 + n], u1
	addxc(	%g0, %g0, w4)
AM4(`	addcc	r0, w0, w0')
	stx	w0, [rp0 + n]
	ADDX(`	%l4, w1, w0')
	umulxhi(u0, v0, %l4)	C w 1
AM4(`	ldx	[rp1 + n], r0')
	addxccc(%l5, w2, w1)
	umulxhi(u0, v1, %l5)	C w 2
	addxccc(%l6, w3, w2)
	umulxhi(u0, v2, %l6)	C w 3
	addxc(	%l7, w4, w3)
	umulxhi(u0, v3, %l7)	C w 4
L(mid):	addcc	%l0, w0, w0
	mulx	u1, v0, %l0	C w 1
	addxccc(%l1, w1, w1)
	mulx	u1, v1, %l1	C w 2
	addxccc(%l2, w2, w2)
	mulx	u1, v2, %l2	C w 3
	addxccc(%l3, w3, w3)
	mulx	u1, v3, %l3	C w 4
	ldx	[up1 + n], u0
	addxc(	%g0, %g0, w4)
AM4(`	addcc	r0, w0, w0')
	stx	w0, [rp1 + n]
	ADDX(`	%l4, w1, w0')
	umulxhi(u1, v0, %l4)	C w 2
AM4(`	ldx	[rp2 + n], r0')
	addxccc(%l5, w2, w1)
	umulxhi(u1, v1, %l5)	C w 3
	addxccc(%l6, w3, w2)
	umulxhi(u1, v2, %l6)	C w 4
	addxc(	%l7, w4, w3)
	umulxhi(u1, v3, %l7)	C w 5
	brlz	n, L(top)		C loop while n is negative
	 add	n, 16, n		C delay slot

C Wind-down.  The first group still issues products, for the limb left in u0
C by the last ldx at L(mid), but loads no further up limb.  The second group
C only consumes products.  The remaining sums are then stored and the final
C carry-propagated word is written to %i0.
L(end):	addcc	%l0, w0, w0
	mulx	u0, v0, %l0
	addxccc(%l1, w1, w1)
	mulx	u0, v1, %l1
	addxccc(%l2, w2, w2)
	mulx	u0, v2, %l2
	addxccc(%l3, w3, w3)
	mulx	u0, v3, %l3
	addxc(	%g0, %g0, w4)
AM4(`	addcc	r0, w0, w0')
	stx	w0, [rp0 + n]
	ADDX(`	%l4, w1, w0')
	umulxhi(u0, v0, %l4)
AM4(`	ldx	[rp1 + n], r0')
	addxccc(%l5, w2, w1)
	umulxhi(u0, v1, %l5)
	addxccc(%l6, w3, w2)
	umulxhi(u0, v2, %l6)
	addxc(	%l7, w4, w3)
	umulxhi(u0, v3, %l7)
	addcc	%l0, w0, w0
	addxccc(%l1, w1, w1)
	addxccc(%l2, w2, w2)
	addxccc(%l3, w3, w3)
	addxc(	%g0, %g0, w4)
AM4(`	addcc	r0, w0, w0')
	stx	w0, [rp1 + n]
	ADDX(`	%l4, w1, w0')
	addxccc(%l5, w2, w1)
	addxccc(%l6, w3, w2)
	stx	w0, [rp2 + n]
	add	n, 16, n		C plain add, carry flag is preserved
	stx	w1, [rp1 + n]
	stx	w2, [rp2 + n]
	addxc(	%l7, w4, %i0)		C top word; %i0 is no longer needed as rp
	ret
	 restore			C delay slot
EPILOGUE()