1dnl mpn_mul_basecase for Pentium 4 and P6 models with SSE2 (i.e., 9,D,E,F). 2 3dnl Copyright 2001, 2002, 2005, 2007 Free Software Foundation, Inc. 4dnl 5dnl This file is part of the GNU MP Library. 6dnl 7dnl The GNU MP Library is free software; you can redistribute it and/or modify 8dnl it under the terms of the GNU Lesser General Public License as published 9dnl by the Free Software Foundation; either version 3 of the License, or (at 10dnl your option) any later version. 11dnl 12dnl The GNU MP Library is distributed in the hope that it will be useful, but 13dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 14dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public 15dnl License for more details. 16dnl 17dnl You should have received a copy of the GNU Lesser General Public License 18dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. 19 20include(`../config.m4') 21 22C TODO: 23C * Improve ad-hoc outer loop code and register handling. Some feed-in 24C scheduling could improve things by several cycles per outer iteration. 25C * In code for un <= 3, try keeping accumulation operands in registers, 26C without storing intermediates to rp. 27C * We might want to keep 32 in a free mm register, since the register form is 28C 3 bytes and the immediate form is 4 bytes. About 70 bytes to save. 29C * Look into different loop alignment, we now expand the code about 50 bytes 30C with possibly needless alignment. 31C * Perhaps rewrap loops 00,01,02 (6 loops) to allow fall-through entry. 32C * Use OSP, should solve feed-in latency problems. 33C * Save a few tens of bytes by doing cross-jumping for Loel0, etc. 34C * Save around 120 bytes by remapping "m 0", "m 1", "m 2" and "m 3" registers 35C so that they can share feed-in code, and changing the branch targets from 36C L<n> to Lm<nn>. 37 38C cycles/limb 39C P6 model 9 (Banias) ? 40C P6 model 13 (Dothan) 5.24 41C P6 model 14 (Yonah) ? 
C P4 model 0-1 (Willamette):	 5
C P4 model 2 (Northwood):	 4.60 at 32 limbs
C P4 model 3-4 (Prescott):	 4.94 at 32 limbs

C INPUT PARAMETERS
C rp		sp + 4
C up		sp + 8
C un		sp + 12
C vp		sp + 16
C vn		sp + 20

C mpn_mul_basecase multiplies the un limbs at up by the vn limbs at vp and
C stores the product limbs at rp.  Limbs are handled as 32-bit quantities
C through movd.
C
C Code layout:
C   un <= 3   straight-line code at labels un1, un2 and un3
C   un >  3   label big dispatches on un mod 4 to labels 0, 1, 2 or 3.  Each
C             of those has an m part, which multiplies up by the first limb
C             of vp and stores un+1 limbs at rp without reading rp, and an am
C             part, whose outer loop olp takes one further limb of vp per
C             pass, multiplies up by it and adds the products into rp one
C             limb higher than the pass before.
C
C Register roles as used below:
C   eax  up, stepped through the inner loops
C   edx  rp, stepped through the inner loops
C   ecx  un at entry; on the big path a negative limb count raised by 4 per
C        inner loop turn until it reaches zero
C   esi  vp
C   ebx  vn, decremented once per limb of vp consumed
C   edi  rp position of the current pass, big path only
C   mm7  current limb of vp
C   mm6  running sum; its low 32 bits are stored, then it is moved right by
C        32 bits so the high part carries into the next limb
C   mm0-mm5  products of up limbs with mm7, and limbs read back from rp
C
C esi and ebx, plus edi on the big path, are pushed at entry and popped before
C every ret.  emms is executed before every ret.
C
C NOTE(review): the un <= 3 code consumes at most un limbs of vp, and the big
C path always runs the first pass before testing vn, so un >= vn >= 1 looks to
C be assumed.  Confirm against the caller contract.

	TEXT
	ALIGN(16)
PROLOGUE(mpn_mul_basecase)
	push	%esi
	push	%ebx
C Two pushes so far, so every argument sits 8 bytes further from esp than in
C the table above.
	mov	12(%esp), %edx		C rp
	mov	16(%esp), %eax		C up
	mov	20(%esp), %ecx		C un
	mov	24(%esp), %esi		C vp
	mov	28(%esp), %ebx		C vn
	movd	(%esi), %mm7		C
L(ent):	cmp	$3, %ecx
	ja	L(big)
C The movd and pmuludq below do not write the integer flags, so the jz that
C follows still tests the compare of un against 3.
	movd	(%eax), %mm6
	pmuludq	%mm7, %mm6
	jz	L(un3)
	cmp	$2, %ecx
	jz	L(un2)

C un = 1: store the two halves of the single 64-bit product.
L(un1):	movd	%mm6, (%edx)		C un=1
	psrlq	$32, %mm6		C un=1
	movd	%mm6, 4(%edx)		C un=1
	jmp	L(rtr)			C un=1

C un = 2: first pass stores three limbs; if vn is not exhausted a second pass
C with the limb at vp+4 is added in one limb higher and stores a fourth limb.
L(un2):	movd	4(%eax), %mm1		C un=2
	pmuludq	%mm7, %mm1		C un=2
	movd	%mm6, (%edx)		C un=2
	psrlq	$32, %mm6		C un=2
	paddq	%mm1, %mm6		C un=2
	movd	%mm6, 4(%edx)		C un=2
	psrlq	$32, %mm6		C un=2
	movd	%mm6, 8(%edx)		C un=2
	dec	%ebx			C un=2
	jz	L(rtr)			C un=2
	movd	4(%esi), %mm7		C un=2
	movd	(%eax), %mm6		C un=2
	pmuludq	%mm7, %mm6		C un=2
	movd	4(%eax), %mm1		C un=2
	movd	4(%edx), %mm4		C un=2
	pmuludq	%mm7, %mm1		C un=2
	movd	8(%edx), %mm5		C un=2
	paddq	%mm4, %mm6		C un=2
	paddq	%mm1, %mm5		C un=2
	movd	%mm6, 4(%edx)		C un=2
	psrlq	$32, %mm6		C un=2
	paddq	%mm5, %mm6		C un=2
	movd	%mm6, 8(%edx)		C un=2
	psrlq	$32, %mm6		C un=2
	movd	%mm6, 12(%edx)		C un=2
C Common exit for the un <= 3 paths: only esi and ebx were pushed here.
L(rtr):	emms
	pop	%ebx
	pop	%esi
	ret

C un = 3: first pass stores four limbs; up to two further passes, using the
C limbs at vp+4 and vp+8, are each added in one limb higher than the last.
L(un3):	movd	4(%eax), %mm1		C un=3
	pmuludq	%mm7, %mm1		C un=3
	movd	8(%eax), %mm2		C un=3
	pmuludq	%mm7, %mm2		C un=3
	movd	%mm6, (%edx)		C un=3
	psrlq	$32, %mm6		C un=3
	paddq	%mm1, %mm6		C un=3
	movd	%mm6, 4(%edx)		C un=3
	psrlq	$32, %mm6		C un=3
	paddq	%mm2, %mm6		C un=3
	movd	%mm6, 8(%edx)		C un=3
	psrlq	$32, %mm6		C un=3
	movd	%mm6, 12(%edx)		C un=3
	dec	%ebx			C un=3
	jz	L(rtr)			C un=3
	movd	4(%esi), %mm7		C un=3
	movd	(%eax), %mm6		C un=3
	pmuludq	%mm7, %mm6		C un=3
	movd	4(%eax), %mm1		C un=3
	movd	4(%edx), %mm4		C un=3
	pmuludq	%mm7, %mm1		C un=3
	movd	8(%eax), %mm2		C un=3
	movd	8(%edx), %mm5		C un=3
	pmuludq	%mm7, %mm2		C un=3
	paddq	%mm4, %mm6		C un=3
	paddq	%mm1, %mm5		C un=3
	movd	12(%edx), %mm4		C un=3
	movd	%mm6, 4(%edx)		C un=3
	psrlq	$32, %mm6		C un=3
	paddq	%mm5, %mm6		C un=3
	paddq	%mm2, %mm4		C un=3
	movd	%mm6, 8(%edx)		C un=3
	psrlq	$32, %mm6		C un=3
	paddq	%mm4, %mm6		C un=3
	movd	%mm6, 12(%edx)		C un=3
	psrlq	$32, %mm6		C un=3
	movd	%mm6, 16(%edx)		C un=3
	dec	%ebx			C un=3
	jz	L(rtr)			C un=3
	movd	8(%esi), %mm7		C un=3
	movd	(%eax), %mm6		C un=3
	pmuludq	%mm7, %mm6		C un=3
	movd	4(%eax), %mm1		C un=3
	movd	8(%edx), %mm4		C un=3
	pmuludq	%mm7, %mm1		C un=3
	movd	8(%eax), %mm2		C un=3
	movd	12(%edx), %mm5		C un=3
	pmuludq	%mm7, %mm2		C un=3
	paddq	%mm4, %mm6		C un=3
	paddq	%mm1, %mm5		C un=3
	movd	16(%edx), %mm4		C un=3
	movd	%mm6, 8(%edx)		C un=3
	psrlq	$32, %mm6		C un=3
	paddq	%mm5, %mm6		C un=3
	paddq	%mm2, %mm4		C un=3
	movd	%mm6, 12(%edx)		C un=3
	psrlq	$32, %mm6		C un=3
	paddq	%mm4, %mm6		C un=3
	movd	%mm6, 16(%edx)		C un=3
	psrlq	$32, %mm6		C un=3
	movd	%mm6, 20(%edx)		C un=3
	jmp	L(rtr)


C un > 3.  A third register is pushed here, so from now on rp is at 16, up at
C 20 and un at 24 from esp.  mm6 is cleared as the initial carry, vp is moved
C past the limb already held in mm7, and un mod 4 selects the code copy.
C The jc and jz after the compare with 2 both test that one compare.
L(big):	push	%edi
	pxor	%mm6, %mm6
	lea	4(%esi), %esi
	and	$3, %ecx
	jz	L(0)
	cmp	$2, %ecx
	jc	L(1)
	jz	L(2)
	jmp	L(3)			C FIXME: one case should fall through


C un mod 4 = 0, first pass.  ecx becomes un mod 4 minus un, a negative
C multiple of 4, and is written over the un stack slot so that each later
C pass can reload it.  The loop is entered in the middle at m00.
L(0):	movd	(%eax), %mm3		C m 0
	sub	24(%esp), %ecx		C inner loop count m 0
	mov	%ecx, 24(%esp)		C update loop count for later m 0
	pmuludq	%mm7, %mm3		C m 0
	movd	4(%eax), %mm0		C m 0
	pmuludq	%mm7, %mm0		C m 0
	movd	8(%eax), %mm1		C m 0
	jmp	L(m00)			C m 0
	ALIGN(16)			C m 0
L(lpm0):
	pmuludq	%mm7, %mm4		C m 0
	paddq	%mm0, %mm6		C m 0
	movd	(%eax), %mm3		C m 0
	movd	%mm6, -12(%edx)		C m 0
	psrlq	$32, %mm6		C m 0
	pmuludq	%mm7, %mm3		C m 0
	paddq	%mm1, %mm6		C m 0
	movd	4(%eax), %mm0		C m 0
	movd	%mm6, -8(%edx)		C m 0
	psrlq	$32, %mm6		C m 0
	pmuludq	%mm7, %mm0		C m 0
	paddq	%mm4, %mm6		C m 0
	movd	8(%eax), %mm1		C m 0
	movd	%mm6, -4(%edx)		C m 0
	psrlq	$32, %mm6		C m 0
L(m00):	pmuludq	%mm7, %mm1		C m 0
	paddq	%mm3, %mm6		C m 0
	movd	12(%eax), %mm4		C m 0
	movd	%mm6, (%edx)		C m 0
	psrlq	$32, %mm6		C m 0
	lea	16(%eax), %eax		C m 0
	lea	16(%edx), %edx		C m 0
C The count is negative and rises by 4; the branch is taken until it is zero.
	add	$4, %ecx		C m 0
	ja	L(lpm0)			C m 0
C Wind-down of the first pass; the last stores are shared with the am code
C at x0.  edi is loaded with the original rp for the passes that follow.
	pmuludq	%mm7, %mm4		C m 0
	paddq	%mm0, %mm6		C m 0
	movd	%mm6, -12(%edx)		C m 0
	psrlq	$32, %mm6		C m 0
	paddq	%mm1, %mm6		C m 0
	mov	16(%esp), %edi		C rp 0
	jmp	L(x0)

C un mod 4 = 0, later passes.  Each pass moves edi one limb up, fetches the
C next limb of vp into mm7, reloads up and the saved count, clears the carry
C and adds the new products to the limbs already at rp.
L(olp0):
	lea	4(%edi), %edi		C am 0
	movd	(%esi), %mm7		C am 0
	lea	4(%esi), %esi		C am 0
	mov	%edi, %edx		C rp am 0
	mov	20(%esp), %eax		C up am 0
	movd	(%eax), %mm3		C am 0
	mov	24(%esp), %ecx		C inner loop count am 0
	pxor	%mm6, %mm6		C am 0
	pmuludq	%mm7, %mm3		C am 0
	movd	4(%eax), %mm0		C am 0
	movd	(%edx), %mm5		C am 0
	pmuludq	%mm7, %mm0		C am 0
	movd	8(%eax), %mm1		C am 0
	paddq	%mm3, %mm5		C am 0
	movd	4(%edx), %mm4		C am 0
	jmp	L(am00)			C am 0
	ALIGN(16)			C am 0
L(lam0):
	pmuludq	%mm7, %mm2		C am 0
	paddq	%mm4, %mm6		C am 0
	movd	(%eax), %mm3		C am 0
	paddq	%mm1, %mm5		C am 0
	movd	-4(%edx), %mm4		C am 0
	movd	%mm6, -12(%edx)		C am 0
	psrlq	$32, %mm6		C am 0
	pmuludq	%mm7, %mm3		C am 0
	paddq	%mm5, %mm6		C am 0
	movd	4(%eax), %mm0		C am 0
	paddq	%mm2, %mm4		C am 0
	movd	(%edx), %mm5		C am 0
	movd	%mm6, -8(%edx)		C am 0
	psrlq	$32, %mm6		C am 0
	pmuludq	%mm7, %mm0		C am 0
	paddq	%mm4, %mm6		C am 0
	movd	8(%eax), %mm1		C am 0
	paddq	%mm3, %mm5		C am 0
	movd	4(%edx), %mm4		C am 0
	movd	%mm6, -4(%edx)		C am 0
	psrlq	$32, %mm6		C am 0
L(am00):
	pmuludq	%mm7, %mm1		C am 0
	paddq	%mm5, %mm6		C am 0
	movd	12(%eax), %mm2		C am 0
	paddq	%mm0, %mm4		C am 0
	movd	8(%edx), %mm5		C am 0
	movd	%mm6, (%edx)		C am 0
	psrlq	$32, %mm6		C am 0
	lea	16(%eax), %eax		C am 0
	lea	16(%edx), %edx		C am 0
	add	$4, %ecx		C am 0
	jnz	L(lam0)			C am 0
	pmuludq	%mm7, %mm2		C am 0
	paddq	%mm4, %mm6		C am 0
	paddq	%mm1, %mm5		C am 0
	movd	-4(%edx), %mm4		C am 0
	movd	%mm6, -12(%edx)		C am 0
	psrlq	$32, %mm6		C am 0
	paddq	%mm5, %mm6		C am 0
	paddq	%mm2, %mm4		C am 0
C Shared tail: store the last limbs of this pass, then count down vn.
L(x0):	movd	%mm6, -8(%edx)		C am 0
	psrlq	$32, %mm6		C am 0
	paddq	%mm4, %mm6		C am 0
	movd	%mm6, -4(%edx)		C am 0
	psrlq	$32, %mm6		C am 0
	movd	%mm6, (%edx)		C am 0
	dec	%ebx			C am 0
	jnz	L(olp0)			C am 0
L(oel0):
	emms				C 0
	pop	%edi			C 0
	pop	%ebx			C 0
	pop	%esi			C 0
	ret				C 0


C un mod 4 = 1, first pass.  Same scheme as the copy for 0, with the loop
C entered at m01 and the store offsets moved by one limb.
L(1):	movd	(%eax), %mm4		C m 1
	sub	24(%esp), %ecx		C m 1
	mov	%ecx, 24(%esp)		C update loop count for later m 1
	pmuludq	%mm7, %mm4		C m 1
	movd	4(%eax), %mm3		C m 1
	pmuludq	%mm7, %mm3		C m 1
	movd	8(%eax), %mm0		C m 1
	jmp	L(m01)			C m 1
	ALIGN(16)			C m 1
L(lpm1):
	pmuludq	%mm7, %mm4		C m 1
	paddq	%mm0, %mm6		C m 1
	movd	4(%eax), %mm3		C m 1
	movd	%mm6, -8(%edx)		C m 1
	psrlq	$32, %mm6		C m 1
	pmuludq	%mm7, %mm3		C m 1
	paddq	%mm1, %mm6		C m 1
	movd	8(%eax), %mm0		C m 1
	movd	%mm6, -4(%edx)		C m 1
	psrlq	$32, %mm6		C m 1
L(m01):	pmuludq	%mm7, %mm0		C m 1
	paddq	%mm4, %mm6		C m 1
	movd	12(%eax), %mm1		C m 1
	movd	%mm6, (%edx)		C m 1
	psrlq	$32, %mm6		C m 1
	pmuludq	%mm7, %mm1		C m 1
	paddq	%mm3, %mm6		C m 1
	movd	16(%eax), %mm4		C m 1
	movd	%mm6, 4(%edx)		C m 1
	psrlq	$32, %mm6		C m 1
	lea	16(%eax), %eax		C m 1
	lea	16(%edx), %edx		C m 1
	add	$4, %ecx		C m 1
	ja	L(lpm1)			C m 1
	pmuludq	%mm7, %mm4		C m 1
	paddq	%mm0, %mm6		C m 1
	movd	%mm6, -8(%edx)		C m 1
	psrlq	$32, %mm6		C m 1
	paddq	%mm1, %mm6		C m 1
	mov	16(%esp), %edi		C rp 1
	jmp	L(x1)

C un mod 4 = 1, later passes.
L(olp1):
	lea	4(%edi), %edi		C am 1
	movd	(%esi), %mm7		C am 1
	lea	4(%esi), %esi		C am 1
	mov	%edi, %edx		C rp am 1
	mov	20(%esp), %eax		C up am 1
	movd	(%eax), %mm2		C am 1
	mov	24(%esp), %ecx		C inner loop count am 1
	pxor	%mm6, %mm6		C am 1
	pmuludq	%mm7, %mm2		C am 1
	movd	4(%eax), %mm3		C am 1
	movd	(%edx), %mm4		C am 1
	pmuludq	%mm7, %mm3		C am 1
	movd	8(%eax), %mm0		C am 1
	paddq	%mm2, %mm4		C am 1
	movd	4(%edx), %mm5		C am 1
	jmp	L(am01)			C am 1
	ALIGN(16)			C am 1
L(lam1):
	pmuludq	%mm7, %mm2		C am 1
	paddq	%mm4, %mm6		C am 1
	movd	4(%eax), %mm3		C am 1
	paddq	%mm1, %mm5		C am 1
	movd	(%edx), %mm4		C am 1
	movd	%mm6, -8(%edx)		C am 1
	psrlq	$32, %mm6		C am 1
	pmuludq	%mm7, %mm3		C am 1
	paddq	%mm5, %mm6		C am 1
	movd	8(%eax), %mm0		C am 1
	paddq	%mm2, %mm4		C am 1
	movd	4(%edx), %mm5		C am 1
	movd	%mm6, -4(%edx)		C am 1
	psrlq	$32, %mm6		C am 1
L(am01):
	pmuludq	%mm7, %mm0		C am 1
	paddq	%mm4, %mm6		C am 1
	movd	12(%eax), %mm1		C am 1
	paddq	%mm3, %mm5		C am 1
	movd	8(%edx), %mm4		C am 1
	movd	%mm6, (%edx)		C am 1
	psrlq	$32, %mm6		C am 1
	pmuludq	%mm7, %mm1		C am 1
	paddq	%mm5, %mm6		C am 1
	movd	16(%eax), %mm2		C am 1
	paddq	%mm0, %mm4		C am 1
	movd	12(%edx), %mm5		C am 1
	movd	%mm6, 4(%edx)		C am 1
	psrlq	$32, %mm6		C am 1
	lea	16(%eax), %eax		C am 1
	lea	16(%edx), %edx		C am 1
	add	$4, %ecx		C am 1
	jnz	L(lam1)			C am 1
	pmuludq	%mm7, %mm2		C am 1
	paddq	%mm4, %mm6		C am 1
	paddq	%mm1, %mm5		C am 1
	movd	(%edx), %mm4		C am 1
	movd	%mm6, -8(%edx)		C am 1
	psrlq	$32, %mm6		C am 1
	paddq	%mm5, %mm6		C am 1
	paddq	%mm2, %mm4		C am 1
L(x1):	movd	%mm6, -4(%edx)		C am 1
	psrlq	$32, %mm6		C am 1
	paddq	%mm4, %mm6		C am 1
	movd	%mm6, (%edx)		C am 1
	psrlq	$32, %mm6		C am 1
	movd	%mm6, 4(%edx)		C am 1
	dec	%ebx			C am 1
	jnz	L(olp1)			C am 1
L(oel1):
	emms				C 1
	pop	%edi			C 1
	pop	%ebx			C 1
	pop	%esi			C 1
	ret				C 1


C un mod 4 = 2, first pass.  Loop entered at m10.
L(2):	movd	(%eax), %mm1		C m 2
	sub	24(%esp), %ecx		C m 2
	mov	%ecx, 24(%esp)		C update loop count for later m 2
	pmuludq	%mm7, %mm1		C m 2
	movd	4(%eax), %mm4		C m 2
	pmuludq	%mm7, %mm4		C m 2
	movd	8(%eax), %mm3		C m 2
	jmp	L(m10)			C m 2
	ALIGN(16)			C m 2
L(lpm2):
	pmuludq	%mm7, %mm4		C m 2
	paddq	%mm0, %mm6		C m 2
	movd	8(%eax), %mm3		C m 2
	movd	%mm6, -4(%edx)		C m 2
	psrlq	$32, %mm6		C m 2
L(m10):	pmuludq	%mm7, %mm3		C m 2
	paddq	%mm1, %mm6		C m 2
	movd	12(%eax), %mm0		C m 2
	movd	%mm6, (%edx)		C m 2
	psrlq	$32, %mm6		C m 2
	pmuludq	%mm7, %mm0		C m 2
	paddq	%mm4, %mm6		C m 2
	movd	16(%eax), %mm1		C m 2
	movd	%mm6, 4(%edx)		C m 2
	psrlq	$32, %mm6		C m 2
	pmuludq	%mm7, %mm1		C m 2
	paddq	%mm3, %mm6		C m 2
	movd	20(%eax), %mm4		C m 2
	movd	%mm6, 8(%edx)		C m 2
	psrlq	$32, %mm6		C m 2
	lea	16(%eax), %eax		C m 2
	lea	16(%edx), %edx		C m 2
	add	$4, %ecx		C m 2
	ja	L(lpm2)			C m 2
	pmuludq	%mm7, %mm4		C m 2
	paddq	%mm0, %mm6		C m 2
	movd	%mm6, -4(%edx)		C m 2
	psrlq	$32, %mm6		C m 2
	paddq	%mm1, %mm6		C m 2
	mov	16(%esp), %edi		C rp 2
	jmp	L(x2)

C un mod 4 = 2, later passes.
L(olp2):
	lea	4(%edi), %edi		C am 2
	movd	(%esi), %mm7		C am 2
	lea	4(%esi), %esi		C am 2
	mov	%edi, %edx		C rp am 2
	mov	20(%esp), %eax		C up am 2
	movd	(%eax), %mm1		C am 2
	mov	24(%esp), %ecx		C inner loop count am 2
	pxor	%mm6, %mm6		C am 2
	pmuludq	%mm7, %mm1		C am 2
	movd	4(%eax), %mm2		C am 2
	movd	(%edx), %mm5		C am 2
	pmuludq	%mm7, %mm2		C am 2
	movd	8(%eax), %mm3		C am 2
	paddq	%mm1, %mm5		C am 2
	movd	4(%edx), %mm4		C am 2
	jmp	L(am10)			C am 2
	ALIGN(16)			C am 2
L(lam2):
	pmuludq	%mm7, %mm2		C am 2
	paddq	%mm4, %mm6		C am 2
	movd	8(%eax), %mm3		C am 2
	paddq	%mm1, %mm5		C am 2
	movd	4(%edx), %mm4		C am 2
	movd	%mm6, -4(%edx)		C am 2
	psrlq	$32, %mm6		C am 2
L(am10):
	pmuludq	%mm7, %mm3		C am 2
	paddq	%mm5, %mm6		C am 2
	movd	12(%eax), %mm0		C am 2
	paddq	%mm2, %mm4		C am 2
	movd	8(%edx), %mm5		C am 2
	movd	%mm6, (%edx)		C am 2
	psrlq	$32, %mm6		C am 2
	pmuludq	%mm7, %mm0		C am 2
	paddq	%mm4, %mm6		C am 2
	movd	16(%eax), %mm1		C am 2
	paddq	%mm3, %mm5		C am 2
	movd	12(%edx), %mm4		C am 2
	movd	%mm6, 4(%edx)		C am 2
	psrlq	$32, %mm6		C am 2
	pmuludq	%mm7, %mm1		C am 2
	paddq	%mm5, %mm6		C am 2
	movd	20(%eax), %mm2		C am 2
	paddq	%mm0, %mm4		C am 2
	movd	16(%edx), %mm5		C am 2
	movd	%mm6, 8(%edx)		C am 2
	psrlq	$32, %mm6		C am 2
	lea	16(%eax), %eax		C am 2
	lea	16(%edx), %edx		C am 2
	add	$4, %ecx		C am 2
	jnz	L(lam2)			C am 2
	pmuludq	%mm7, %mm2		C am 2
	paddq	%mm4, %mm6		C am 2
	paddq	%mm1, %mm5		C am 2
	movd	4(%edx), %mm4		C am 2
	movd	%mm6, -4(%edx)		C am 2
	psrlq	$32, %mm6		C am 2
	paddq	%mm5, %mm6		C am 2
	paddq	%mm2, %mm4		C am 2
L(x2):	movd	%mm6, (%edx)		C am 2
	psrlq	$32, %mm6		C am 2
	paddq	%mm4, %mm6		C am 2
	movd	%mm6, 4(%edx)		C am 2
	psrlq	$32, %mm6		C am 2
	movd	%mm6, 8(%edx)		C am 2
	dec	%ebx			C am 2
	jnz	L(olp2)			C am 2
L(oel2):
	emms				C 2
	pop	%edi			C 2
	pop	%ebx			C 2
	pop	%esi			C 2
	ret				C 2


C un mod 4 = 3, first pass.  The loop is entered at its top here.
C NOTE(review): the jmp to the label right after the alignment presumably
C exists to step over whatever padding the alignment emits.  Confirm.
L(3):	movd	(%eax), %mm0		C m 3
	sub	24(%esp), %ecx		C m 3
	mov	%ecx, 24(%esp)		C update loop count for later m 3
	pmuludq	%mm7, %mm0		C m 3
	movd	4(%eax), %mm1		C m 3
	pmuludq	%mm7, %mm1		C m 3
	movd	8(%eax), %mm4		C m 3
	jmp	L(lpm3)			C m 3
	ALIGN(16)			C m 3
L(lpm3):
	pmuludq	%mm7, %mm4		C m 3
	paddq	%mm0, %mm6		C m 3
	movd	12(%eax), %mm3		C m 3
	movd	%mm6, (%edx)		C m 3
	psrlq	$32, %mm6		C m 3
	pmuludq	%mm7, %mm3		C m 3
	paddq	%mm1, %mm6		C m 3
	movd	16(%eax), %mm0		C m 3
	movd	%mm6, 4(%edx)		C m 3
	psrlq	$32, %mm6		C m 3
	pmuludq	%mm7, %mm0		C m 3
	paddq	%mm4, %mm6		C m 3
	movd	20(%eax), %mm1		C m 3
	movd	%mm6, 8(%edx)		C m 3
	psrlq	$32, %mm6		C m 3
	pmuludq	%mm7, %mm1		C m 3
	paddq	%mm3, %mm6		C m 3
	movd	24(%eax), %mm4		C m 3
	movd	%mm6, 12(%edx)		C m 3
	psrlq	$32, %mm6		C m 3
	lea	16(%eax), %eax		C m 3
	lea	16(%edx), %edx		C m 3
	add	$4, %ecx		C m 3
	ja	L(lpm3)			C m 3
	pmuludq	%mm7, %mm4		C m 3
	paddq	%mm0, %mm6		C m 3
	movd	%mm6, (%edx)		C m 3
	psrlq	$32, %mm6		C m 3
	paddq	%mm1, %mm6		C m 3
	mov	16(%esp), %edi		C rp 3
	jmp	L(x3)

C un mod 4 = 3, later passes.
L(olp3):
	lea	4(%edi), %edi		C am 3
	movd	(%esi), %mm7		C am 3
	lea	4(%esi), %esi		C am 3
	mov	%edi, %edx		C rp am 3
	mov	20(%esp), %eax		C up am 3
	movd	(%eax), %mm0		C am 3
	mov	24(%esp), %ecx		C inner loop count am 3
	pxor	%mm6, %mm6		C am 3
	pmuludq	%mm7, %mm0		C am 3
	movd	4(%eax), %mm1		C am 3
	movd	(%edx), %mm4		C am 3
	pmuludq	%mm7, %mm1		C am 3
	movd	8(%eax), %mm2		C am 3
	paddq	%mm0, %mm4		C am 3
	movd	4(%edx), %mm5		C am 3
	jmp	L(lam3)			C am 3
	ALIGN(16)			C am 3
L(lam3):
	pmuludq	%mm7, %mm2		C am 3
	paddq	%mm4, %mm6		C am 3
	movd	12(%eax), %mm3		C am 3
	paddq	%mm1, %mm5		C am 3
	movd	8(%edx), %mm4		C am 3
	movd	%mm6, (%edx)		C am 3
	psrlq	$32, %mm6		C am 3
	pmuludq	%mm7, %mm3		C am 3
	paddq	%mm5, %mm6		C am 3
	movd	16(%eax), %mm0		C am 3
	paddq	%mm2, %mm4		C am 3
	movd	12(%edx), %mm5		C am 3
	movd	%mm6, 4(%edx)		C am 3
	psrlq	$32, %mm6		C am 3
	pmuludq	%mm7, %mm0		C am 3
	paddq	%mm4, %mm6		C am 3
	movd	20(%eax), %mm1		C am 3
	paddq	%mm3, %mm5		C am 3
	movd	16(%edx), %mm4		C am 3
	movd	%mm6, 8(%edx)		C am 3
	psrlq	$32, %mm6		C am 3
	pmuludq	%mm7, %mm1		C am 3
	paddq	%mm5, %mm6		C am 3
	movd	24(%eax), %mm2		C am 3
	paddq	%mm0, %mm4		C am 3
	movd	20(%edx), %mm5		C am 3
	movd	%mm6, 12(%edx)		C am 3
	psrlq	$32, %mm6		C am 3
	lea	16(%eax), %eax		C am 3
	lea	16(%edx), %edx		C am 3
	add	$4, %ecx		C am 3
	jnz	L(lam3)			C am 3
	pmuludq	%mm7, %mm2		C am 3
	paddq	%mm4, %mm6		C am 3
	paddq	%mm1, %mm5		C am 3
	movd	8(%edx), %mm4		C am 3
	movd	%mm6, (%edx)		C am 3
	psrlq	$32, %mm6		C am 3
	paddq	%mm5, %mm6		C am 3
	paddq	%mm2, %mm4		C am 3
L(x3):	movd	%mm6, 4(%edx)		C am 3
	psrlq	$32, %mm6		C am 3
	paddq	%mm4, %mm6		C am 3
	movd	%mm6, 8(%edx)		C am 3
	psrlq	$32, %mm6		C am 3
	movd	%mm6, 12(%edx)		C am 3
	dec	%ebx			C am 3
	jnz	L(olp3)			C am 3
L(oel3):
	emms				C 3
	pop	%edi			C 3
	pop	%ebx			C 3
	pop	%esi			C 3
	ret				C 3
EPILOGUE()