dnl  mpn_mul_1 for Pentium 4 and P6 models with SSE2 (i.e., 9,D,E,F).

dnl  Copyright 2005, 2007, 2011 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.


include(`../config.m4')

C                           cycles/limb
C P6 model 0-8,10-12		-
C P6 model 9   (Banias)		4.17
C P6 model 13  (Dothan)		4.17
C P4 model 0-1 (Willamette)	4
C P4 model 2   (Northwood)	4
C P4 model 3-4 (Prescott)	4.55

C TODO:
C  * Tweak eax/edx offsets in loop as to save some lea's
C  * Perhaps software pipeline small-case code

C INPUT PARAMETERS
C rp		sp + 4
C up		sp + 8
C n		sp + 12
C v0		sp + 16
C carry-in	sp + 20		(read by mpn_mul_1c only)

C OPERATION
C Multiplies the n 32-bit limbs at up by v0, stores the n low product limbs
C at rp, and returns the final carry limb in eax.  mpn_mul_1 starts from a
C zero carry; mpn_mul_1c starts from the carry-in argument.
C
C The small-case loop tests the count only after it has processed a limb,
C so n must be at least 1.

C REGISTER USE
C eax		up (biased in the unrolled code); return value on exit
C edx		rp (biased in the unrolled code)
C ecx		n in the small-case loop; (n mod 4) - n in the unrolled loop
C mm7		v0
C mm6		64-bit accumulator: low half is the limb to store, high half
C		is the carry.  A 32x32 product plus a 32-bit carry cannot
C		overflow 64 bits.
C mm0-mm3	limbs in flight (loaded, then multiplied, then accumulated)

	TEXT
	ALIGN(16)
PROLOGUE(mpn_mul_1)
	pxor	%mm6, %mm6		C no carry-in
L(ent):	mov	4(%esp), %edx		C rp
	mov	8(%esp), %eax		C up
	mov	12(%esp), %ecx		C n
	movd	16(%esp), %mm7		C v0
	cmp	$4, %ecx
	jae	L(big)			C n >= 4 (unsigned): use unrolled loop

C Small case, n = 1..3: one limb per iteration, not pipelined.
L(small):
	movd	(%eax), %mm0
	lea	4(%eax), %eax
	lea	4(%edx), %edx
	pmuludq	%mm7, %mm0		C up[i] * v0
	paddq	%mm0, %mm6		C plus carry
	movd	%mm6, -4(%edx)		C rp was already advanced, hence -4
	psrlq	$32, %mm6		C keep the high half as next carry
	dec	%ecx
	jne	L(small)
	movd	%mm6, %eax		C return the final carry
	emms
	ret

C Large case, n >= 4: dispatch on n mod 4 to pick the loop entry point.
L(big):	and	$3, %ecx
	jz	L(mod0)
	cmp	$2, %ecx
	jb	L(mod1)
	je	L(mod2)
	jmp	L(mod3)			C FIXME: one case should fall through

C Each feed-in block below primes the pipeline with one finished product,
C one further product and one loaded limb, sets ecx = (n mod 4) - n (a
C negative multiple of 4), and biases eax/edx so that the displacements
C used at its loop entry point address up[] and rp[] correctly.

L(mod0):
	movd	(%eax), %mm3
	sub	12(%esp), %ecx		C loop count
	lea	-16(%eax), %eax
	lea	-12(%edx), %edx
	pmuludq	%mm7, %mm3
	movd	20(%eax), %mm0
	pmuludq	%mm7, %mm0
	movd	24(%eax), %mm1
	jmp	L(ent0)

L(mod1):
	movd	(%eax), %mm2
	sub	12(%esp), %ecx		C loop count
	lea	-12(%eax), %eax
	lea	-8(%edx), %edx
	pmuludq	%mm7, %mm2
	movd	16(%eax), %mm3
	pmuludq	%mm7, %mm3
	movd	20(%eax), %mm0
	jmp	L(ent1)

L(mod2):
	movd	(%eax), %mm1
	sub	12(%esp), %ecx		C loop count
	lea	-8(%eax), %eax
	lea	-4(%edx), %edx
	pmuludq	%mm7, %mm1
	movd	12(%eax), %mm2
	pmuludq	%mm7, %mm2
	movd	16(%eax), %mm3
	jmp	L(ent2)

C n mod 4 = 3 enters at the loop top, so edx needs no bias and the block
C simply falls through.
L(mod3):
	movd	(%eax), %mm0
	sub	12(%esp), %ecx		C loop count
	lea	-4(%eax), %eax
	pmuludq	%mm7, %mm0
	movd	8(%eax), %mm1
	pmuludq	%mm7, %mm1
	movd	12(%eax), %mm2

C Main loop, 4 limbs per iteration.  Every stage multiplies the limb that
C was loaded one stage earlier, accumulates the product formed one stage
C earlier, loads a new limb, stores the low half of mm6 and shifts the
C carry down.
	ALIGN(16)
L(top):	pmuludq	%mm7, %mm2
	paddq	%mm0, %mm6
	movd	16(%eax), %mm3
	movd	%mm6, 0(%edx)
	psrlq	$32, %mm6
L(ent2):
	pmuludq	%mm7, %mm3
	paddq	%mm1, %mm6
	movd	20(%eax), %mm0
	movd	%mm6, 4(%edx)
	psrlq	$32, %mm6
L(ent1):
	pmuludq	%mm7, %mm0
	paddq	%mm2, %mm6
	movd	24(%eax), %mm1
	movd	%mm6, 8(%edx)
	psrlq	$32, %mm6
L(ent0):
	pmuludq	%mm7, %mm1
	paddq	%mm3, %mm6
	movd	28(%eax), %mm2
	movd	%mm6, 12(%edx)
	psrlq	$32, %mm6
	lea	16(%eax), %eax		C lea leaves the flags alone
	lea	16(%edx), %edx
	add	$4, %ecx		C reaches 0 (CF and ZF set) when done
	ja	L(top)			C taken while ecx is still negative

C Wind-down: mm0 and mm1 hold finished products and mm2 holds the last
C limb, loaded but not yet multiplied.
L(tail):
	pmuludq	%mm7, %mm2
	paddq	%mm0, %mm6
	movd	%mm6, 0(%edx)
	psrlq	$32, %mm6
	paddq	%mm1, %mm6
	movd	%mm6, 4(%edx)
	psrlq	$32, %mm6
	paddq	%mm2, %mm6
	movd	%mm6, 8(%edx)
	psrlq	$32, %mm6
	movd	%mm6, %eax		C return the final carry
	emms
	ret
EPILOGUE()

C mpn_mul_1c: same as mpn_mul_1, but the accumulator starts from the
C carry-in argument instead of zero.
PROLOGUE(mpn_mul_1c)
	movd	20(%esp), %mm6		C carry-in
	jmp	L(ent)
EPILOGUE()