dnl  Power9 mpn_mul_basecase.

dnl  Copyright 1999-2001, 2003-2006, 2008, 2017-2018 Free Software Foundation,
dnl  Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C                  cycles/limb
C POWER3/PPC630          -
C POWER4/PPC970          -
C POWER5                 -
C POWER6                 -
C POWER7                 -
C POWER8                 -
C POWER9                1.62

C TODO
C  * Check if (inner) loop alignment affects performance.
C  * Could we schedule loads less in addmul_2/mul_2?  That would save some
C    regs and make the tail code more manageable.
C  * Postpone some register saves to main loop.
C  * Perhaps write more small operands (3x1, 3x2, 3x3) code.
C  * Consider restoring rp,up after loop using arithmetic, eliminating rp2,
C    up2.  On the other hand, the current rp,up restore register are useful
C    for OSP.
C  * Do OSP.  This should save a lot with the current deep addmul_2 pipeline.
C INPUT PARAMETERS
define(`rp', `r3')	C result limb pointer
define(`up', `r4')	C first (larger) operand pointer
define(`un', `r5')	C first operand size in limbs
define(`vp', `r6')	C second operand pointer
define(`vn', `r7')	C second operand size in limbs

define(`v0', `r0')	C even-indexed v limb for the current pass
define(`v1', `r7')	C odd-indexed v limb; aliases vn, which is consumed early
define(`rp2', `r24')	C saved rp, re-seeds rp at the start of each outer pass
define(`up2', `r25')	C saved up, re-seeds up at the start of each outer pass

C mpn_mul_basecase(rp, up, un, vp, vn)
C Writes the full un+vn limb product {up,un} * {vp,vn} to {rp,un+vn}.
C NOTE(review): assumes the standard mpn contract un >= vn >= 1 -- the
C un <= 2 fast paths below only handle vn in {1,2}, consistent with that.
C
C Structure: sizes un <= 2 are handled inline without register saves.  For
C larger operands, a first pass computes rp[] = up[] * v (mul_1 if vn is odd,
C mul_2 if vn is even), after which the outer loop adds two v limbs per pass
C with a software-pipelined addmul_2.  Two independent carry chains are
C interleaved: one through CA (adde/addze) and one through OV (addex with a
C zero third operand), which is what lets the loop sustain ~1.62 c/l.
ASM_START()
PROLOGUE(mpn_mul_basecase)
	cmpdi	cr0, un, 2
	bgt	cr0, L(un_gt2)		C general case for un > 2
	cmpdi	cr6, vn, 1
	ld	r7, 0(vp)		C v[0]
	ld	r5, 0(up)		C u[0]
	mulld	r8, r5, r7		C weight 0
	mulhdu	r9, r5, r7		C weight 1
	std	r8, 0(rp)
	beq	cr0, L(2x)		C un = 2?
	std	r9, 8(rp)		C un = 1, vn = 1: 1x1 done
	blr

	ALIGN(16)
L(2x):	ld	r0, 8(up)		C u[1]
	mulld	r8, r0, r7		C weight 1
	mulhdu	r10, r0, r7		C weight 2
	addc	r9, r9, r8
	addze	r10, r10
	bne	cr6, L(2x2)		C vn = 2?
	std	r9, 8(rp)		C un = 2, vn = 1: 2x1 done
	std	r10, 16(rp)
	blr

	ALIGN(16)
L(2x2):	ld	r6, 8(vp)		C un = 2, vn = 2: v[1]
	mulld	r8, r5, r6		C weight 1
	mulhdu	r11, r5, r6		C weight 2
	addc	r9, r9, r8
	std	r9, 8(rp)
	adde	r11, r11, r10
	mulld	r12, r0, r6		C weight 2
	mulhdu	r0, r0, r6		C weight 3
	addze	r0, r0
	addc	r11, r11, r12
	addze	r0, r0
	std	r11, 16(rp)
	std	r0, 24(rp)
	blr

L(un_gt2):
C Save callee-saved r22-r31 below the stack pointer (ABI-protected area;
C no frame is established -- presumably relying on the ELF ABI's zone below
C r1, as leaf GMP asm routines conventionally do).
	std	r22, -80(r1)
	std	r23, -72(r1)
	std	r24, -64(r1)
	std	r25, -56(r1)
	std	r26, -48(r1)
	std	r27, -40(r1)
	std	r28, -32(r1)
	std	r29, -24(r1)
	std	r30, -16(r1)
	std	r31, -8(r1)
	mr	rp2, r3			C rp
	mr	up2, r4			C up
	srdi	r22, r5, 2		C un/4, inner-loop iteration count
	subfic	r23, r7, 0		C -vn, clear CA
	subfo	r0, r0, r0		C clear OV (and r0)

C Condition registers used throughout:
C   cr6: un = 3 (selects the short-circuit tail exits)
C   cr7: un & 1 (pipeline entry point parity)
C   cr5: (un >> 1) & 1 (entry point within 4-way unrolled loop)
	cmpdi	cr6, un, 3
	rldicl	r0, un, 0, 63		C r0 = un & 1
	cmpdi	cr7, r0, 0
	rldicl	r0, un, 63, 63		C FIXME: unused for vn = 1
	cmpdi	cr5, r0, 0		C FIXME: unused for vn = 1

	ld	v0, 0(vp)
	rldicl.	r9, vn, 0, 63		C vn & 1: choose mul_1 or mul_2 first pass
	beq	cr0, L(vn_evn)

C ----- vn odd: first pass is rp[] = up[] * v0 (a mul_1), leaving an even
C ----- number of v limbs for the addmul_2 outer loop.
L(vn_odd):
	addi	r10, un, -2
	ld	r5, 0(up)
	srdi	r10, r10, 1		C (un-2)/2 loop count
	mtctr	r10
	bne	cr7, L(m1_b1)		C un odd?

L(m1_b0):				C un even: enter pipeline at mid point
	ld	r10, 8(up)
	mulld	r9, r5, v0
	mulhdu	r11, r5, v0
	ld	r12, 16(up)
	mulld	r8, r10, v0
	mulhdu	r5, r10, v0
	addi	rp, rp, -8		C bias rp so loop offsets line up
	b	L(m1_mid)

L(m1_b1):				C un odd: enter pipeline at top
	ld	r12, 8(up)
	mulld	r8, r5, v0
	mulhdu	r5, r5, v0
	ld	r10, 16(up)
	mulld	r9, r12, v0
	mulhdu	r11, r12, v0
	addi	up, up, 8
	beq	cr6, L(m1_end)		C jump taken means un = 3, vn = {1,3}

C 2-way unrolled mul_1 loop; products are computed one iteration ahead of
C the stores, with the carry chain running through CA (adde).
	ALIGN(16)
L(m1_top):
	ld	r12, 16(up)
	std	r8, 0(rp)
	adde	r9, r5, r9
	mulld	r8, r10, v0
	mulhdu	r5, r10, v0
L(m1_mid):
	ld	r10, 24(up)
	std	r9, 8(rp)
	adde	r8, r11, r8
	mulld	r9, r12, v0
	mulhdu	r11, r12, v0
	addi	rp, rp, 16
	addi	up, up, 16
	bdnz	L(m1_top)

L(m1_end):				C drain the pipeline, store last 4 limbs
	std	r8, 0(rp)
	mulld	r8, r10, v0
	adde	r9, r5, r9
	mulhdu	r5, r10, v0
	std	r9, 8(rp)
	adde	r8, r11, r8
	std	r8, 16(rp)
	addze	r10, r5
	std	r10, 24(rp)

	addi	rp2, rp2, 8		C outer passes resume one limb in
	addi	vp, vp, 8		C one v limb consumed
	addic.	r23, r23, 1		C advance -vn count; cr0 != 0: more v limbs
	b	L(do_outer)

C ----- vn even: first pass is rp[] = up[] * {v0,v1} (a mul_2).
L(vn_evn):
	ld	v1, 8(vp)
	addi	r23, r23, 2		C two v limbs consumed by this pass
	mtctr	r22
	bne	cr7, L(m2_bx1)		C dispatch on un & 1

L(m2_bx0):				C un even; prime the pipeline
	ld	r8, 0(up)
	ld	r9, 8(up)
	li	r11, 0
	mulld	r28, r8, v0
	mulhdu	r31, r8, v0
	mulld	r5, r8, v1
	mulhdu	r10, r8, v1
	li	r12, 0
	bne	cr5, L(m2_b10)		C dispatch on (un >> 1) & 1

L(m2_b00):				C un = 0 (mod 4): bias pointers, enter at lo0
	addi	up, up, -8
	addi	rp, rp, -24
	b	L(m2_lo0)

L(m2_b10):				C un = 2 (mod 4): enter at lo2
	addi	up, up, 8
	addi	rp, rp, -8
	b	L(m2_lo2)

L(m2_bx1):				C un odd; prime the pipeline
	ld	r9, 0(up)
	ld	r8, 8(up)
	li	r10, 0
	mulld	r29, r9, v0
	mulhdu	r30, r9, v0
	mulld	r12, r9, v1
	mulhdu	r11, r9, v1
	li	r5, 0
	bne	cr5, L(m2_b11)

L(m2_b01):				C un = 1 (mod 4): enter at lo1
	addi	rp, rp, -16
	b	L(m2_lo1)
L(m2_b11):				C un = 3 (mod 4): enter at the loop top
	addi	up, up, 16
	beq	cr6, L(m2_end)		C taken means un = 3, vn = 2.  We're done.

C 4-way unrolled mul_2 loop.  maddld/maddhdu fold the previous high limb in,
C the v0 rail carries through CA (adde) and the v1 rail through OV (addex
C with zero third operand), so the two chains never stall each other.
L(m2_top):
	ld	r9, 0(up)
	maddld(	r28, r8, v0, r10)
	maddhdu(r31, r8, v0, r10)
	adde	r5, r29, r5
	std	r5, 0(rp)
	mulld	r5, r8, v1
	mulhdu	r10, r8, v1
	addex(	r12, r12, r30, 0)
L(m2_lo2):
	ld	r8, 8(up)
	maddld(	r29, r9, v0, r11)
	maddhdu(r30, r9, v0, r11)
	adde	r12, r28, r12
	std	r12, 8(rp)
	mulld	r12, r9, v1
	mulhdu	r11, r9, v1
	addex(	r5, r5, r31, 0)
L(m2_lo1):
	ld	r9, 16(up)
	maddld(	r28, r8, v0, r10)
	maddhdu(r31, r8, v0, r10)
	adde	r5, r29, r5
	std	r5, 16(rp)
	mulld	r5, r8, v1
	mulhdu	r10, r8, v1
	addex(	r12, r12, r30, 0)
L(m2_lo0):
	ld	r8, 24(up)
	maddld(	r29, r9, v0, r11)
	maddhdu(r30, r9, v0, r11)
	adde	r12, r28, r12
	std	r12, 24(rp)
	mulld	r12, r9, v1
	mulhdu	r11, r9, v1
	addex(	r5, r5, r31, 0)
	addi	up, up, 32
	addi	rp, rp, 32
	bdnz	L(m2_top)

L(m2_end):				C wind-down; shares final limbs with L(cj)
	ld	r9, 0(up)
	maddld(	r28, r8, v0, r10)
	maddhdu(r31, r8, v0, r10)
	adde	r5, r29, r5
	std	r5, 0(rp)
	mulld	r5, r8, v1
	mulhdu	r10, r8, v1
	b	L(cj)

C ----- addmul_2 outer loop: each pass folds two more v limbs into rp[],
C ----- reading the previous partial product back through rp2.
L(outer):
	ld	v0, 0(vp)
	ld	v1, 8(vp)
	addi	r23, r23, 2		C two more v limbs consumed
	mtctr	r22
	bne	cr7, L(bx1)		C same un (mod 4) dispatch as mul_2

L(bx0):	ld	r26, 0(rp2)		C rp limbs to accumulate onto
	ld	r8, 0(up2)
	ld	r11, 8(rp2)
	ld	r9, 8(up2)
	maddld(	r28, r8, v0, r26)
	maddhdu(r31, r8, v0, r26)
	ld	r26, 16(rp2)
	mulld	r5, r8, v1
	mulhdu	r10, r8, v1
	li	r12, 0
	bne	cr5, L(b10)

L(b00):	addi	up, up2, -8		C un = 0 (mod 4): enter at lo0
	addi	rp, rp2, -24
	b	L(lo0)

L(b10):	addi	up, up2, 8		C un = 2 (mod 4): enter at lo2
	addi	rp, rp2, -8
	b	L(lo2)

L(bx1):	ld	r27, 0(rp2)
	ld	r9, 0(up2)
	ld	r10, 8(rp2)
	ld	r8, 8(up2)
	maddld(	r29, r9, v0, r27)
	maddhdu(r30, r9, v0, r27)
	ld	r27, 16(rp2)
	mulld	r12, r9, v1
	mulhdu	r11, r9, v1
	li	r5, 0
	bne	cr5, L(b11)

L(b01):	addi	up, up2, 0		C un = 1 (mod 4): enter at lo1
	addi	rp, rp2, -16
	b	L(lo1)
L(b11):	addi	up, up2, 16		C un = 3 (mod 4): enter at the loop top
	addi	rp, rp2, 0
	beq	cr6, L(end)		C taken means un = 3, vn = 3.  We're done.

C 4-way unrolled addmul_2 loop.  Like L(m2_top) but the v1 rail also
C accumulates the existing rp limb (loaded 3-4 limbs ahead into r26/r27)
C via maddld/maddhdu.  Dual CA/OV carry chains as in the mul_2 loop.
L(top):	ld	r9, 0(up)
	maddld(	r28, r8, v0, r10)
	maddhdu(r31, r8, v0, r10)
	adde	r5, r29, r5
	ld	r26, 24(rp)
	std	r5, 0(rp)
	maddld(	r5, r8, v1, r27)
	maddhdu(r10, r8, v1, r27)
	addex(	r12, r12, r30, 0)
L(lo2):	ld	r8, 8(up)
	maddld(	r29, r9, v0, r11)
	maddhdu(r30, r9, v0, r11)
	adde	r12, r28, r12
	ld	r27, 32(rp)
	std	r12, 8(rp)
	maddld(	r12, r9, v1, r26)
	maddhdu(r11, r9, v1, r26)
	addex(	r5, r5, r31, 0)
L(lo1):	ld	r9, 16(up)
	maddld(	r28, r8, v0, r10)
	maddhdu(r31, r8, v0, r10)
	adde	r5, r29, r5
	ld	r26, 40(rp)
	std	r5, 16(rp)
	maddld(	r5, r8, v1, r27)
	maddhdu(r10, r8, v1, r27)
	addex(	r12, r12, r30, 0)
L(lo0):	ld	r8, 24(up)
	maddld(	r29, r9, v0, r11)
	maddhdu(r30, r9, v0, r11)
	adde	r12, r28, r12
	ld	r27, 48(rp)
	std	r12, 24(rp)
	maddld(	r12, r9, v1, r26)
	maddhdu(r11, r9, v1, r26)
	addex(	r5, r5, r31, 0)
	addi	up, up, 32
	addi	rp, rp, 32
	bdnz	L(top)

L(end):	ld	r9, 0(up)		C drain pipeline, fold both carry chains
	maddld(	r28, r8, v0, r10)
	maddhdu(r31, r8, v0, r10)
	adde	r5, r29, r5
	std	r5, 0(rp)
	maddld(	r5, r8, v1, r27)
	maddhdu(r10, r8, v1, r27)
L(cj):	addex(	r12, r12, r30, 0)	C shared tail with L(m2_end)
	maddld(	r29, r9, v0, r11)
	maddhdu(r30, r9, v0, r11)
	adde	r12, r28, r12
	std	r12, 8(rp)
	mulld	r12, r9, v1
	mulhdu	r11, r9, v1
	addex(	r5, r5, r31, 0)
	adde	r5, r29, r5
	std	r5, 16(rp)
	addex(	r12, r12, r30, 0)
	adde	r12, r12, r10
	std	r12, 24(rp)
	li	r4, 0
	addze	r5, r11			C absorb final CA carry
	addex(	r5, r5, r4, 0)		C absorb final OV carry
	std	r5, 32(rp)

	cmpdi	cr0, r23, 0		C any v limbs left?
	addi	rp2, rp2, 16		C next pass starts two limbs further in
	addi	vp, vp, 16
L(do_outer):
	bne	cr0, L(outer)
L(ret):
C Restore callee-saved registers and return.
	ld	r22, -80(r1)
	ld	r23, -72(r1)
	ld	r24, -64(r1)
	ld	r25, -56(r1)
	ld	r26, -48(r1)
	ld	r27, -40(r1)
	ld	r28, -32(r1)
	ld	r29, -24(r1)
	ld	r30, -16(r1)
	ld	r31, -8(r1)
	blr
EPILOGUE()
ASM_END()