dnl  AMD64 mpn_mullo_basecase.

dnl  Contributed to the GNU project by Torbjorn Granlund.

dnl  Copyright 2008, 2009, 2011, 2012 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C The inner loops of this code are the result of running a code generation and
C optimisation tool suite written by David Harvey and Torbjorn Granlund.

C NOTES
C   * There is a major stupidity in that we call mpn_mul_1 initially, for a
C     large trip count.  Instead, we should start with mul_2 for any operand
C     size congruence class.
C   * Stop iterating addmul_2 earlier, falling into straight-line triangle code
C     for the last 2-3 iterations.
C   * Perhaps implement n=4 special code.
C   * The reload of the outer loop jump address hurts branch prediction.
C   * The addmul_2 loop ends with an MUL whose high part is not used upon loop
C     exit.

C ----------------------------------------------------------------------------
C mpn_mullo_basecase (rp, up, vp_param, n)
C
C Writes n limbs at rp holding the least significant half of the product of
C the two n-limb operands at up and vp_param.  The n = 1, 2 and 3 paths below
C spell this out term by term; n >= 4 takes the general path at L(gen).
C n = 0 is not supported: its jump table slot does not lead to code.
C
C ABI_SUPPORT, PROLOGUE/EPILOGUE, FUNC_ENTRY/FUNC_EXIT, JUMPTABSECT, JMPENT,
C L() and the R8()/R32() sub-register selectors are provided by the included
C m4 configuration, not by this file.
C ----------------------------------------------------------------------------

C INPUT PARAMETERS
define(`rp',	   `%rdi')
define(`up',	   `%rsi')
define(`vp_param', `%rdx')
define(`n',	   `%rcx')

C WORKING REGISTERS (general path only)
C   vp          copy of vp_param; rdx itself is overwritten by every mul
C   outer_addr  address of the addmul entry to use for the next outer pass
C   j           inner loop limb index; negative, stepped up towards zero
C   v0, v1      the multiplier limb(s) handled by the current pass
C   w0..w3      rotating accumulator limbs
C Note that r8 and r9 double as plain scratch in the n < 4 paths.
define(`vp',	     `%r11')
define(`outer_addr', `%r8')
define(`j',	     `%r9')

define(`v0',	`%r13')
define(`v1',	`%r14')

define(`w0',	`%rbx')
define(`w1',	`%r15')
define(`w2',	`%rbp')
define(`w3',	`%r10')

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(mpn_mullo_basecase)
	FUNC_ENTRY(4)

C Sizes 1..3 are dispatched through a table to straight-line code that uses
C caller-saved registers only, so nothing is pushed on those paths.
	cmp	$4, n
	jge	L(gen)
	mov	(up), %rax		C rax = u0
	mov	(vp_param), %r8		C r8  = v0

C PIC build: 4-byte table entries, sign-extended and added to the table
C address.  Otherwise: 8-byte entries used directly as jump targets.
	lea	L(tab)(%rip), %r9
ifdef(`PIC',
`	movslq	(%r9,n,4), %r10
	add	%r10, %r9
	jmp	*%r9
',`
	jmp	*(%r9,n,8)
')
	JUMPTABSECT
	ALIGN(8)
L(tab):	JMPENT(	L(tab), L(tab))			C n = 0, not allowed
	JMPENT(	L(1), L(tab))			C n = 1
	JMPENT(	L(2), L(tab))			C n = 2
	JMPENT(	L(3), L(tab))			C n = 3
dnl	JMPENT(	L(0m4), L(tab))			C 4
dnl	JMPENT(	L(1m4), L(tab))			C 5
dnl	JMPENT(	L(2m4), L(tab))			C 6
dnl	JMPENT(	L(3m4), L(tab))			C 7
dnl	JMPENT(	L(0m4), L(tab))			C 8
dnl	JMPENT(	L(1m4), L(tab))			C 9
dnl	JMPENT(	L(2m4), L(tab))			C 10
dnl	JMPENT(	L(3m4), L(tab))			C 11
	TEXT

C ---- n = 1:  r0 = low(u0 * v0) ----
L(1):	imul	%r8, %rax
	mov	%rax, (rp)
	FUNC_EXIT()
	ret

C ---- n = 2:  r0 = low(u0*v0),  r1 = high(u0*v0) + low(u0*v1) + low(u1*v0) ----
L(2):	mov	8(vp_param), %r11	C v1, fetched before mul overwrites rdx
	imul	%rax, %r11		C low(u0 * v1)
	mul	%r8			C rdx:rax = u0 * v0
	mov	%rax, (rp)		C r0
	imul	8(up), %r8		C low(u1 * v0)
	lea	(%r11, %rdx), %rax
	add	%r8, %rax
	mov	%rax, 8(rp)		C r1
	FUNC_EXIT()
	ret

C ---- n = 3:  rcx collects r1, r9 collects r2 ----
L(3):	mov	8(vp_param), %r9	C v1
	mov	16(vp_param), %r11	C v2
	mul	%r8			C u0 * v0 -> <r1,r0>
	mov	%rax, (rp)		C store r0
	mov	(up), %rax		C reload u0
	mov	%rdx, %rcx		C r1 so far
	mul	%r9			C u0 * v1 -> <r2,r1>
	imul	8(up), %r9		C low(u1 * v1) -> r2
	mov	16(up), %r10
	imul	%r8, %r10		C low(u2 * v0) -> r2
	add	%rax, %rcx
	adc	%rdx, %r9
	add	%r10, %r9
	mov	8(up), %rax		C u1
	mul	%r8			C u1 * v0 -> <r2,r1>
	add	%rax, %rcx
	adc	%rdx, %r9
	mov	%r11, %rax
	imul	(up), %rax		C low(u0 * v2) -> r2
	add	%rax, %r9
	mov	%rcx, 8(rp)		C store r1
	mov	%r9, 16(rp)		C store r2
	FUNC_EXIT()
	ret

C ---- general case, n >= 4 ----
C The four labels ahead of L(gen) are named only by the disabled table
C entries above.
L(0m4):
L(1m4):
L(2m4):
L(3m4):
L(gen):	push	%rbx
	push	%rbp
	push	%r13
	push	%r14
	push	%r15

	mov	(up), %rax		C u0
	mov	(vp_param), v0
	mov	vp_param, vp		C keep the v pointer clear of rdx

C rp and up are moved n limbs forward and n is negated, so limbs are addressed
C with negative indices and the inner loops stop once j is no longer negative.
	lea	(rp,n,8), rp
	lea	(up,n,8), up
	neg	n

	mul	v0			C rdx:rax = u0 * v0

C Odd n: first pass multiplies by v0 alone.  Even n: first pass uses v0 and v1.
	test	$1, R8(n)
	jz	L(mul_2)

C ---- first pass, one multiplier limb (n odd) ----
L(mul_1):
	lea	-8(rp), rp
	lea	-8(up), up
	test	$2, R8(n)
	jnz	L(mul_1_prologue_3)

L(mul_1_prologue_2):		C n = 7, 11, 15, ...
	lea	-1(n), j
	lea	L(addmul_outer_1)(%rip), outer_addr
	mov	%rax, w0
	mov	%rdx, w1
	xor	R32(w2), R32(w2)
	xor	R32(w3), R32(w3)
	mov	16(up,n,8), %rax
	jmp	L(mul_1_entry_2)

L(mul_1_prologue_3):		C n = 5, 9, 13, ...
	lea	1(n), j
	lea	L(addmul_outer_3)(%rip), outer_addr
	mov	%rax, w2
	mov	%rdx, w3
	xor	R32(w0), R32(w0)
	jmp	L(mul_1_entry_0)

C Four limbs per trip; the pending limb rotates through w0, w1, w2, w3.
	ALIGN(16)
L(mul_1_top):
	mov	w0, -16(rp,j,8)
	add	%rax, w1
	mov	(up,j,8), %rax
	adc	%rdx, w2
	xor	R32(w0), R32(w0)
	mul	v0
	mov	w1, -8(rp,j,8)
	add	%rax, w2
	adc	%rdx, w3
L(mul_1_entry_0):
	mov	8(up,j,8), %rax
	mul	v0
	mov	w2, (rp,j,8)
	add	%rax, w3
	adc	%rdx, w0
	mov	16(up,j,8), %rax
	mul	v0
	mov	w3, 8(rp,j,8)
	xor	R32(w2), R32(w2)	C w2 = 0
	mov	w2, w3			C w3 = 0
	add	%rax, w0
	mov	24(up,j,8), %rax
	mov	w2, w1			C w1 = 0; mov leaves the carry for adc
	adc	%rdx, w1
L(mul_1_entry_2):
	mul	v0
	add	$4, j
	js	L(mul_1_top)

C Wind down: two full limbs, then the last limb which takes only the low half
C of its product (imul).
	mov	w0, -16(rp)
	add	%rax, w1
	mov	w1, -8(rp)
	adc	%rdx, w2

	imul	(up), v0
	add	v0, w2
	mov	w2, (rp)

	add	$1, n			C one multiplier limb consumed
	jz	L(ret)

C Load the next two multiplier limbs, reposition the pointers and enter the
C addmul pass selected by the prologue.
	mov	8(vp), v0
	mov	16(vp), v1

	lea	16(up), up
	lea	8(vp), vp
	lea	24(rp), rp

	jmp	*outer_addr


C ---- first pass, two multiplier limbs (n even) ----
L(mul_2):
	mov	8(vp), v1
	test	$2, R8(n)
	jz	L(mul_2_prologue_3)

	ALIGN(16)
L(mul_2_prologue_1):		C n = 6, 10, 14, ...
	lea	0(n), j
	mov	%rax, w3
	mov	%rdx, w0
	xor	R32(w1), R32(w1)
	mov	(up,n,8), %rax
	lea	L(addmul_outer_3)(%rip), outer_addr
	jmp	L(mul_2_entry_1)

	ALIGN(16)
L(mul_2_prologue_3):		C n = 4, 8, 12, ...
	lea	2(n), j
	mov	$0, R32(w3)
	mov	%rax, w1
	mov	(up,n,8), %rax
	mov	%rdx, w2
	lea	L(addmul_outer_1)(%rip), outer_addr
	jmp	L(mul_2_entry_3)

C Four result limbs per trip.  Every up limb is loaded twice, once for the
C multiply by v1 and once for the multiply by v0.
	ALIGN(16)
L(mul_2_top):
	mov	-32(up,j,8), %rax
	mul	v1
	add	%rax, w0
	adc	%rdx, w1
	mov	-24(up,j,8), %rax
	xor	R32(w2), R32(w2)
	mul	v0
	add	%rax, w0
	mov	-24(up,j,8), %rax
	adc	%rdx, w1
	adc	$0, R32(w2)
	mul	v1
	add	%rax, w1
	mov	w0, -24(rp,j,8)
	adc	%rdx, w2
	mov	-16(up,j,8), %rax
	mul	v0
	mov	$0, R32(w3)
	add	%rax, w1
	adc	%rdx, w2
	mov	-16(up,j,8), %rax
	adc	$0, R32(w3)
L(mul_2_entry_3):
	mov	$0, R32(w0)
	mov	w1, -16(rp,j,8)
	mul	v1
	add	%rax, w2
	mov	-8(up,j,8), %rax
	adc	%rdx, w3
	mov	$0, R32(w1)
	mul	v0
	add	%rax, w2
	mov	-8(up,j,8), %rax
	adc	%rdx, w3
	adc	R32(w1), R32(w0)
	mul	v1
	add	%rax, w3
	mov	w2, -8(rp,j,8)
	adc	%rdx, w0
	mov	(up,j,8), %rax
	mul	v0
	add	%rax, w3
	adc	%rdx, w0
	adc	$0, R32(w1)
L(mul_2_entry_1):
	add	$4, j
	mov	w3, -32(rp,j,8)		C mov keeps the sign flag from add for js
	js	L(mul_2_top)

C Last limb of this pass: only low product halves are added in.
	imul	-16(up), v1
	add	v1, w0
	imul	-8(up), v0
	add	v0, w0
	mov	w0, -8(rp)

	add	$2, n			C two multiplier limbs consumed
	jz	L(ret)

	mov	16(vp), v0
	mov	24(vp), v1

	lea	16(vp), vp
	lea	16(rp), rp

	jmp	*outer_addr


C ---- outer passes: add up[] * <v1,v0> into rp[] ----
C Two entry points, picked through outer_addr.  Each one stores the address of
C the other, so successive passes alternate between them.
L(addmul_outer_1):
	lea	-2(n), j
	mov	-16(up,n,8), %rax
	mul	v0
	mov	%rax, w3
	mov	-16(up,n,8), %rax
	mov	%rdx, w0
	xor	R32(w1), R32(w1)
	lea	L(addmul_outer_3)(%rip), outer_addr
	jmp	L(addmul_entry_1)

L(addmul_outer_3):
	lea	0(n), j
	mov	-16(up,n,8), %rax
	xor	R32(w3), R32(w3)
	mul	v0
	mov	%rax, w1
	mov	-16(up,n,8), %rax
	mov	%rdx, w2
	lea	L(addmul_outer_1)(%rip), outer_addr
	jmp	L(addmul_entry_3)

C Same shape as the mul_2 loop, except that each finished limb is added to
C memory and the carry out of that add is picked up by the following adc.
	ALIGN(16)
L(addmul_top):
	add	w3, -32(rp,j,8)
	adc	%rax, w0
	mov	-24(up,j,8), %rax
	adc	%rdx, w1
	xor	R32(w2), R32(w2)
	mul	v0
	add	%rax, w0
	mov	-24(up,j,8), %rax
	adc	%rdx, w1
	adc	R32(w2), R32(w2)
	mul	v1
	xor	R32(w3), R32(w3)
	add	w0, -24(rp,j,8)
	adc	%rax, w1
	mov	-16(up,j,8), %rax
	adc	%rdx, w2
	mul	v0
	add	%rax, w1
	mov	-16(up,j,8), %rax
	adc	%rdx, w2
	adc	$0, R32(w3)
L(addmul_entry_3):
	mul	v1
	add	w1, -16(rp,j,8)
	adc	%rax, w2
	mov	-8(up,j,8), %rax
	adc	%rdx, w3
	mul	v0
	xor	R32(w0), R32(w0)
	add	%rax, w2
	adc	%rdx, w3
	mov	$0, R32(w1)		C mov, not xor: the carry is used below
	mov	-8(up,j,8), %rax
	adc	R32(w1), R32(w0)
	mul	v1
	add	w2, -8(rp,j,8)
	adc	%rax, w3
	adc	%rdx, w0
	mov	(up,j,8), %rax
	mul	v0
	add	%rax, w3
	mov	(up,j,8), %rax
	adc	%rdx, w0
	adc	$0, R32(w1)
L(addmul_entry_1):
	mul	v1
	add	$4, j
	js	L(addmul_top)

C Wind down.  Of the final mul only rax is used; the last limb adds a low
C product half (imul) as well.
	add	w3, -32(rp)
	adc	%rax, w0

	imul	-24(up), v0
	add	v0, w0
	add	w0, -24(rp)

	add	$2, n			C two more multiplier limbs consumed
	jns	L(ret)

	lea	16(vp), vp

	mov	(vp), v0
	mov	8(vp), v1

	lea	-16(up), up

	jmp	*outer_addr

C Restore the registers pushed at L(gen), in reverse order.
L(ret):	pop	%r15
	pop	%r14
	pop	%r13
	pop	%rbp
	pop	%rbx
	FUNC_EXIT()
	ret
EPILOGUE()