dnl  mpn_mul_1 for Pentium 4 and P6 models with SSE2 (i.e., 9,D,E,F).

dnl  Copyright 2005, 2007 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.


include(`../config.m4')

C TODO:
C  * Tweak eax/edx offsets in loop as to save some lea's
C  * Perhaps software pipeline small-case code

C                           cycles/limb
C P6 model 0-8,10-12            -
C P6 model 9   (Banias)         ?
C P6 model 13  (Dothan)         4.17
C P4 model 0-1 (Willamette):    4
C P4 model 2   (Northwood):     4
C P4 model 3-4 (Prescott):      4.55

C INPUT PARAMETERS
C rp		sp + 4
C up		sp + 8
C n		sp + 12
C v0		sp + 16
C carry-in	sp + 20		(read by mpn_mul_1c only)

C OVERVIEW
C mpn_mul_1 multiplies the n limbs (32 bits each) at up by the single limb
C v0, stores the n low result limbs at rp and returns the final carry limb
C in eax.  mpn_mul_1c is the same except that the carry chain starts from
C the limb at sp + 20 instead of from zero.  Both entry points share one
C body, joined at the label ent.
C
C REGISTER USAGE
C eax		source pointer (up), carry limb on return
C edx		destination pointer (rp)
C ecx		n; in the big case a negative counter stepping by 4 up to 0
C mm7		v0
C mm6		64-bit carry accumulator; low half is the next result limb
C mm0-mm3	source limbs and products in flight
C
C NOTES
C * No stack frame is created and esp is never changed, so n can be read
C   again from 12(%esp) in the big case.
C * n = 0 is not checked; the small loop decrements before testing, so
C   callers are presumably required to pass n >= 1 (confirm against the
C   mpn interface documentation).
C * emms is executed before each ret because the MMX registers are used.
C * For n >= 4 a 4-way unrolled loop is used.  Each of its four stages
C   multiplies the limb loaded by the previous stage, adds the product that
C   was formed two stages earlier into mm6, loads a new limb, stores the low
C   half of mm6 and shifts mm6 right by 32.  The setup blocks for
C   n mod 4 = 0, 1, 2, 3 prime that pipeline, bias eax and edx so that the
C   fixed displacements in the loop hit the right limbs, and enter the loop
C   at the labels 00, 01, 10 and top respectively.

	TEXT
	ALIGN(16)
PROLOGUE(mpn_mul_1c)
	mov	4(%esp), %edx		C edx = rp
	mov	8(%esp), %eax		C eax = up
	mov	12(%esp), %ecx		C ecx = n
	movd	16(%esp), %mm7		C mm7 = v0
	movd	20(%esp), %mm6		C mm6 = carry-in limb
	jmp	L(ent)			C continue in the shared body
EPILOGUE()
	ALIGN(16)
PROLOGUE(mpn_mul_1)
	mov	4(%esp), %edx		C edx = rp
	mov	8(%esp), %eax		C eax = up
	mov	12(%esp), %ecx		C ecx = n
	movd	16(%esp), %mm7		C mm7 = v0
	pxor	%mm6, %mm6		C carry accumulator starts at zero
L(ent):	cmp	$4, %ecx
	jnc	L(big)			C n >= 4 uses the unrolled loop

C Small case, n < 4: one limb per iteration, no pipelining.
L(lp0):	movd	(%eax), %mm0
	lea	4(%eax), %eax
	lea	4(%edx), %edx
	pmuludq	%mm7, %mm0		C mm0 = limb * v0 (64 bits)
	paddq	%mm0, %mm6		C add product to running carry
	movd	%mm6, -4(%edx)		C store low limb (edx already advanced)
	psrlq	$32, %mm6		C keep high half as the next carry
	dec	%ecx
	jnz	L(lp0)
	movd	%mm6, %eax		C return the carry limb
	emms
	ret

C Big case, n >= 4: dispatch on n mod 4.  The setup for n mod 4 = 3 is
C placed directly below so that it is reached by falling through; the other
C three setups live after the end of the main code.
L(big):	and	$3, %ecx		C ecx = n mod 4
	je	L(0)
	cmp	$2, %ecx
	jc	L(1)			C n mod 4 = 1
	je	L(2)			C n mod 4 = 2
					C n mod 4 = 3 falls through

L(3):	movd	(%eax), %mm0
	sub	12(%esp), %ecx		C ecx = (n mod 4) - n
	lea	-4(%eax), %eax
	pmuludq	%mm7, %mm0
	movd	8(%eax), %mm1
	pmuludq	%mm7, %mm1
	movd	12(%eax), %mm2

	ALIGN(16)
L(top):	pmuludq	%mm7, %mm2		C multiply limb loaded one stage ago
	paddq	%mm0, %mm6		C add product formed two stages ago
	movd	16(%eax), %mm3		C load a new limb
	movd	%mm6, 0(%edx)		C store result limb
	psrlq	$32, %mm6		C carry = high half
L(10):	pmuludq	%mm7, %mm3
	paddq	%mm1, %mm6
	movd	20(%eax), %mm0
	movd	%mm6, 4(%edx)
	psrlq	$32, %mm6
L(01):	pmuludq	%mm7, %mm0
	paddq	%mm2, %mm6
	movd	24(%eax), %mm1
	movd	%mm6, 8(%edx)
	psrlq	$32, %mm6
L(00):	pmuludq	%mm7, %mm1
	paddq	%mm3, %mm6
	movd	28(%eax), %mm2
	movd	%mm6, 12(%edx)
	psrlq	$32, %mm6
	lea	16(%eax), %eax		C lea keeps the flags untouched
	lea	16(%edx), %edx
	add	$4, %ecx		C reaching zero sets both CF and ZF
	ja	L(top)			C loop while ecx has not reached zero

C Wind down: mm0 and mm1 hold finished products, mm2 holds a loaded limb
C that still has to be multiplied.
L(end):	pmuludq	%mm7, %mm2
	paddq	%mm0, %mm6
	movd	%mm6, 0(%edx)
	psrlq	$32, %mm6
	paddq	%mm1, %mm6
	movd	%mm6, 4(%edx)
	psrlq	$32, %mm6
	paddq	%mm2, %mm6
	movd	%mm6, 8(%edx)
	psrlq	$32, %mm6
	movd	%mm6, %eax		C return the carry limb
	emms
	ret

C Out-of-line pipeline setups for n mod 4 = 0, 1 and 2.

L(0):	movd	(%eax), %mm3
	sub	12(%esp), %ecx		C loop count
	lea	-16(%eax), %eax
	lea	-12(%edx), %edx
	pmuludq	%mm7, %mm3
	movd	20(%eax), %mm0
	pmuludq	%mm7, %mm0
	movd	24(%eax), %mm1
	jmp	L(00)

L(1):	movd	(%eax), %mm2
	sub	12(%esp), %ecx
	lea	-12(%eax), %eax
	lea	-8(%edx), %edx
	pmuludq	%mm7, %mm2
	movd	16(%eax), %mm3
	pmuludq	%mm7, %mm3
	movd	20(%eax), %mm0
	jmp	L(01)

L(2):	movd	(%eax), %mm1
	sub	12(%esp), %ecx
	lea	-8(%eax), %eax
	lea	-4(%edx), %edx
	pmuludq	%mm7, %mm1
	movd	12(%eax), %mm2
	pmuludq	%mm7, %mm2
	movd	16(%eax), %mm3
	jmp	L(10)
EPILOGUE()