dnl  AMD64 mpn_mul_basecase optimised for AMD Bulldozer and Piledriver.

dnl  Contributed to the GNU project by Torbjörn Granlund.

dnl  Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C	     cycles/limb	mul_1		mul_2		mul_3		addmul_2
C AMD K8,K9
C AMD K10
C AMD bull			~4.8		~4.55		-		~4.3
C AMD pile			~4.6		~4.55		-		~4.55
C AMD bobcat
C AMD jaguar
C Intel P4
C Intel core
C Intel NHM
C Intel SBR
C Intel IBR
C Intel HWL
C Intel BWL
C Intel atom
C VIA nano

C The inner loops of this code are the result of running a code generation and
C optimisation tool suite written by David Harvey and Torbjörn Granlund.

C TODO
C  * Merge bull-specific mul_1, if it is not slower in the TOOM22 range.
C    Alternatively, we could tweak the present code (which was loopmixed for a
C    different CPU).
C  * Merge faster mul_2, such as the one in the same directory as this file.
C  * Further micro-optimise.

C When playing with pointers, set this to $2 to fall back to conservative
C indexing in wind-down code.
define(`I',`$1')


define(`rp',       `%rdi')
define(`up',       `%rsi')
define(`un_param', `%rdx')
define(`vp',       `%rcx')
define(`vn',       `%r8')

define(`un', `%rbx')

define(`w0', `%r10')
define(`w1', `%r11')
define(`w2', `%r12')
define(`w3', `%r13')
define(`n',  `%rbp')
define(`v0', `%r9')

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_mul_basecase)
	FUNC_ENTRY(4)
IFDOS(`	mov	56(%rsp), %r8d	')
	push	%rbx
	push	%rbp
	mov	un_param, un		C free up rdx
	neg	un

	mov	(up), %rax		C shared for mul_1 and mul_2
	lea	(up,un_param,8), up	C point at operand end
	lea	(rp,un_param,8), rp	C point at rp[un-1]

	mov	(vp), v0		C shared for mul_1 and mul_2
	mul	v0			C shared for mul_1 and mul_2

	test	$1, R8(vn)
	jz	L(do_mul_2)

L(do_mul_1):
	test	$1, R8(un)
	jnz	L(m1x1)

L(m1x0):mov	%rax, w0		C un = 2, 4, 6, 8, ...
	mov	%rdx, w1
	mov	8(up,un,8), %rax
	test	$2, R8(un)
	jnz	L(m110)

L(m100):lea	2(un), n		C un = 4, 8, 12, ...
	jmp	L(m1l0)

L(m110):lea	(un), n			C un = 2, 6, 10, ...
	jmp	L(m1l2)

L(m1x1):mov	%rax, w1		C un = 1, 3, 5, 7, ...
	mov	%rdx, w0
	test	$2, R8(un)
	jz	L(m111)

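C The mul_1 feed-in code dispatches on un mod 4 so that the 4-way unrolled
C loop below can be entered in mid-body, at one of L(m1l0)..L(m1l3).  Since
C un was negated at entry, the index n counts upwards, and the add $4, n at
C the loop bottom leaves carry clear exactly when another 4-limb pass is
C needed.  L(m101) below also doubles as the un = 1 early exit: with un = 1
C we get n = 2, the js falls through, and the single product already in
C rdx:rax is stored directly before returning.
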
L(m101):lea	3(un), n		C un = 1, 5, 9, ...
	test	n, n
	js	L(m1l1)
	mov	%rax, -8(rp)
	mov	%rdx, (rp)
	pop	%rbp
	pop	%rbx
	FUNC_EXIT()
	ret

L(m111):lea	1(un), n		C un = 3, 7, 11, ...
	mov	8(up,un,8), %rax
	jmp	L(m1l3)

	ALIGN(16)
L(m1tp):mov	%rdx, w0
	add	%rax, w1
L(m1l1):mov	-16(up,n,8), %rax
	adc	$0, w0
	mul	v0
	add	%rax, w0
	mov	w1, -24(rp,n,8)
	mov	-8(up,n,8), %rax
	mov	%rdx, w1
	adc	$0, w1
L(m1l0):mul	v0
	mov	w0, -16(rp,n,8)
	add	%rax, w1
	mov	%rdx, w0
	mov	(up,n,8), %rax
	adc	$0, w0
L(m1l3):mul	v0
	mov	w1, -8(rp,n,8)
	mov	%rdx, w1
	add	%rax, w0
	mov	8(up,n,8), %rax
	adc	$0, w1
L(m1l2):mul	v0
	mov	w0, (rp,n,8)
	add	$4, n
	jnc	L(m1tp)

L(m1ed):add	%rax, w1
	adc	$0, %rdx
	mov	w1, I(-8(rp),-24(rp,n,8))
	mov	%rdx, I((rp),-16(rp,n,8))

	dec	R32(vn)
	jz	L(ret2)

	lea	8(vp), vp
	lea	8(rp), rp
	push	%r12
	push	%r13
	push	%r14
	jmp	L(do_addmul)

L(do_mul_2):
define(`v1', `%r14')
	push	%r12
	push	%r13
	push	%r14

	mov	8(vp), v1

	test	$1, R8(un)
	jnz	L(m2b1)

L(m2b0):lea	(un), n
	mov	%rax, w2		C 0
	mov	(up,un,8), %rax
	mov	%rdx, w1		C 1
	mul	v1
	mov	%rax, w0		C 1
	mov	w2, (rp,un,8)		C 0
	mov	8(up,un,8), %rax
	mov	%rdx, w2		C 2
	jmp	L(m2l0)

L(m2b1):lea	1(un), n
	mov	%rax, w0		C 1
	mov	%rdx, w3		C 2
	mov	(up,un,8), %rax
	mul	v1
	mov	w0, (rp,un,8)		C 1
	mov	%rdx, w0		C 3
	mov	%rax, w2		C 0
	mov	8(up,un,8), %rax
	jmp	L(m2l1)

	ALIGN(32)
L(m2tp):add	%rax, w2		C 0
	mov	(up,n,8), %rax
	adc	$0, w0			C 1
L(m2l1):mul	v0
	add	%rax, w2		C 0
	mov	(up,n,8), %rax
	mov	%rdx, w1		C 1
	adc	$0, w1			C 1
	mul	v1
	add	w3, w2			C 0
	adc	$0, w1			C 1
	add	%rax, w0		C 1
	mov	w2, (rp,n,8)		C 0
	mov	8(up,n,8), %rax
	mov	%rdx, w2		C 2
	adc	$0, w2			C 2
L(m2l0):mul	v0
	add	%rax, w0		C 1
	mov	%rdx, w3		C 2
	adc	$0, w3			C 2
	add	w1, w0			C 1
	adc	$0, w3			C 2
	mov	8(up,n,8), %rax
	mul	v1
	add	$2, n
	mov	w0, -8(rp,n,8)		C 1
	mov	%rdx, w0		C 3
	jnc	L(m2tp)

L(m2ed):add	%rax, w2
	adc	$0, %rdx
	add	w3, w2
	adc	$0, %rdx
	mov	w2, I((rp),(rp,n,8))
	mov	%rdx, I(8(rp),8(rp,n,8))

	add	$-2, R32(vn)
	jz	L(ret5)

	lea	16(vp), vp
	lea	16(rp), rp


L(do_addmul):
	push	%r15
	push	vn			C save vn in new stack slot
define(`vn', `(%rsp)')
define(`X0', `%r14')
define(`X1', `%r15')
define(`v1', `%r8')

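C Each pass of L(outer) below folds two further v limbs into the result,
C in effect one addmul_2 step per pass (a sketch in GMP's {ptr,size}
C notation, not a call into another routine):
C
C	do {
C	  {rp, un+2} += {up, un} * {vp, 2}
C	  rp += 2;  vp += 2;  vn -= 2;
C	} while (vn != 0);
C
C As in the mul_1 and mul_2 feed-in code above, the entry code dispatches
C on un mod 4 so that the 4-way unrolled inner loop can be entered at any
C of L(lo0), L(lo1), L(lo2), L(lo3).
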
L(outer):
	mov	(vp), v0
	mov	8(vp), v1

	mov	(up,un,8), %rax
	mul	v0

	test	$1, R8(un)
	jnz	L(bx1)

L(bx0):	mov	%rax, X1
	mov	(up,un,8), %rax
	mov	%rdx, X0
	mul	v1
	test	$2, R8(un)
	jnz	L(b10)

L(b00):	lea	(un), n			C un = 4, 8, 12, ...
	mov	(rp,un,8), w3
	mov	%rax, w0
	mov	8(up,un,8), %rax
	mov	%rdx, w1
	jmp	L(lo0)

L(b10):	lea	2(un), n		C un = 2, 6, 10, ...
	mov	(rp,un,8), w1
	mov	%rdx, w3
	mov	%rax, w2
	mov	8(up,un,8), %rax
	jmp	L(lo2)

L(bx1):	mov	%rax, X0
	mov	(up,un,8), %rax
	mov	%rdx, X1
	mul	v1
	test	$2, R8(un)
	jz	L(b11)

L(b01):	lea	1(un), n		C un = 1, 5, 9, ...
	mov	(rp,un,8), w2
	mov	%rdx, w0
	mov	%rax, w3
	jmp	L(lo1)

L(b11):	lea	-1(un), n		C un = 3, 7, 11, ...
	mov	(rp,un,8), w0
	mov	%rax, w1
	mov	8(up,un,8), %rax
	mov	%rdx, w2
	jmp	L(lo3)

	ALIGN(32)
L(top):
L(lo2):	mul	v0
	add	w1, X1
	mov	X1, -16(rp,n,8)
	mov	%rdx, X1
	adc	%rax, X0
	adc	$0, X1
	mov	-8(up,n,8), %rax
	mul	v1
	mov	-8(rp,n,8), w1
	mov	%rdx, w0
	add	w1, w2
	adc	%rax, w3
	adc	$0, w0
L(lo1):	mov	(up,n,8), %rax
	mul	v0
	add	w2, X0
	mov	X0, -8(rp,n,8)
	mov	%rdx, X0
	adc	%rax, X1
	mov	(up,n,8), %rax
	adc	$0, X0
	mov	(rp,n,8), w2
	mul	v1
	add	w2, w3
	adc	%rax, w0
	mov	8(up,n,8), %rax
	mov	%rdx, w1
	adc	$0, w1
L(lo0):	mul	v0
	add	w3, X1
	mov	X1, (rp,n,8)
	adc	%rax, X0
	mov	8(up,n,8), %rax
	mov	%rdx, X1
	adc	$0, X1
	mov	8(rp,n,8), w3
	mul	v1
	add	w3, w0
	adc	%rax, w1
	mov	16(up,n,8), %rax
	mov	%rdx, w2
	adc	$0, w2
L(lo3):	mul	v0
	add	w0, X0
	mov	X0, 8(rp,n,8)
	mov	%rdx, X0
	adc	%rax, X1
	adc	$0, X0
	mov	16(up,n,8), %rax
	mov	16(rp,n,8), w0
	mul	v1
	mov	%rdx, w3
	add	w0, w1
	adc	%rax, w2
	adc	$0, w3
	mov	24(up,n,8), %rax
	add	$4, n
	jnc	L(top)

L(end):	mul	v0
	add	w1, X1
	mov	X1, I(-16(rp),-16(rp,n,8))
	mov	%rdx, X1
	adc	%rax, X0
	adc	$0, X1
	mov	I(-8(up),-8(up,n,8)), %rax
	mul	v1
	mov	I(-8(rp),-8(rp,n,8)), w1
	add	w1, w2
	adc	%rax, w3
	adc	$0, %rdx
	add	w2, X0
	adc	$0, X1
	mov	X0, I(-8(rp),-8(rp,n,8))
	add	w3, X1
	mov	X1, I((rp),(rp,n,8))
	adc	$0, %rdx
	mov	%rdx, I(8(rp),8(rp,n,8))


	addl	$-2, vn
	lea	16(vp), vp
	lea	16(rp), rp
	jnz	L(outer)

	pop	%rax		C deallocate vn slot
	pop	%r15
L(ret5):pop	%r14
	pop	%r13
	pop	%r12
L(ret2):pop	%rbp
	pop	%rbx
	FUNC_EXIT()
	ret
EPILOGUE()
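
C Taken as a whole, the routine computes the full un x vn limb product
C {rp, un+vn} = {up, un} * {vp, vn}, schoolbook fashion, assuming
C un >= vn >= 1.  A rough C equivalent in terms of the public mpn calls
C (a reference sketch, not the structure actually used above):
C
C	void
C	mul_basecase (mp_ptr rp, mp_srcptr up, mp_size_t un,
C		      mp_srcptr vp, mp_size_t vn)
C	{
C	  rp[un] = mpn_mul_1 (rp, up, un, vp[0]);
C	  for (mp_size_t i = 1; i < vn; i++)
C	    rp[un + i] = mpn_addmul_1 (rp + i, up, un, vp[i]);
C	}
C
C The code above instead starts with a mul_1 pass (vn odd) or a mul_2 pass
C (vn even), then handles the remaining v limbs two at a time with addmul_2
C passes, roughly halving the number of passes over the result area.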