dnl  mpn_addmul_1 for Pentium 4 and P6 models with SSE2 (i.e., 9,D,E,F).

dnl  Copyright 2005, 2007, 2011 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.


include(`../config.m4')

C			    cycles/limb
C P6 model 0-8,10-12		    -
C P6 model 9   (Banias)		    5.24
C P6 model 13  (Dothan)		    5.24
C P4 model 0-1 (Willamette)	    5
C P4 model 2   (Northwood)	    5
C P4 model 3-4 (Prescott)	    5

C TODO:
C  * Tweak eax/edx offsets in loop as to save some lea's
C  * Perhaps software pipeline small-case code

C INPUT PARAMETERS
C rp		sp + 4
C up		sp + 8
C n		sp + 12
C v0		sp + 16
C carry-in	sp + 20		(mpn_addmul_1c entry point only)

C OPERATION
C For each of the n 32-bit limbs, up[i] is multiplied by v0, the 64-bit
C product is added to rp[i] and to the running carry, the low 32 bits are
C stored back to rp[i] and the high 32 bits become the next carry.  The
C final carry is returned in eax.  mpn_addmul_1 starts from a zero carry,
C mpn_addmul_1c starts from the limb passed at sp + 20.
C
C The small-case loop tests the count only after processing a limb, so n is
C assumed to be at least 1.
C
C REGISTER USAGE
C eax		up pointer (return value on exit)
C edx		rp pointer
C ecx		limb count, later the negative loop counter
C mm7		v0
C mm6		running carry / column accumulator
C mm0-mm3	up limbs and their products with v0
C mm4,mm5	rp limbs and partial sums
C
C The 64-bit sums cannot wrap: a 32x32 bit product plus two 32-bit addends
C is at most 2^64 - 1.

	TEXT
	ALIGN(16)
PROLOGUE(mpn_addmul_1)
	pxor	%mm6, %mm6		C carry-in = 0
L(ent):	mov	4(%esp), %edx		C edx = rp
	mov	8(%esp), %eax		C eax = up
	mov	12(%esp), %ecx		C ecx = n
	movd	16(%esp), %mm7		C mm7 = v0
	cmp	$4, %ecx
	jnc	L(big)			C n >= 4 (unsigned): unrolled code

C Small case, n < 4: one limb per iteration.
L(lp0):	movd	(%eax), %mm0
	lea	4(%eax), %eax
	movd	(%edx), %mm4
	lea	4(%edx), %edx
	pmuludq	%mm7, %mm0		C up limb * v0
	paddq	%mm0, %mm4		C + rp limb
	paddq	%mm4, %mm6		C + carry
	movd	%mm6, -4(%edx)		C store low half, edx already advanced
	psrlq	$32, %mm6		C high half is the next carry
	dec	%ecx
	jnz	L(lp0)
	movd	%mm6, %eax		C return carry
	emms
	ret

C Large case, n >= 4.  n mod 4 selects the entry point into the 4-way
C unrolled loop.  Each feed-in block below loads and multiplies the first
C limbs, biases eax and edx so that the fixed offsets used in the loop line
C up with its entry point, and sets ecx = (n mod 4) - n, a negative multiple
C of 4 which the loop counts up to zero.
L(big):	and	$3, %ecx
	je	L(0)
	cmp	$2, %ecx
	jc	L(1)
	jne	L(3)
	C n mod 4 = 2 falls through

L(2):	movd	(%eax), %mm1
	sub	12(%esp), %ecx		C loop count
	lea	-8(%eax), %eax
	lea	-4(%edx), %edx
	pmuludq	%mm7, %mm1
	movd	12(%eax), %mm2
	movd	4(%edx), %mm5
	pmuludq	%mm7, %mm2
	movd	16(%eax), %mm3
	paddq	%mm1, %mm5
	movd	8(%edx), %mm4
	jmp	L(10)

L(0):	movd	(%eax), %mm3
	sub	12(%esp), %ecx		C loop count
	lea	-16(%eax), %eax
	lea	-12(%edx), %edx
	pmuludq	%mm7, %mm3
	movd	20(%eax), %mm0
	movd	12(%edx), %mm5
	pmuludq	%mm7, %mm0
	movd	24(%eax), %mm1
	paddq	%mm3, %mm5
	movd	16(%edx), %mm4
	jmp	L(00)

L(1):	movd	(%eax), %mm2
	sub	12(%esp), %ecx		C loop count
	lea	-12(%eax), %eax
	lea	-8(%edx), %edx
	movd	8(%edx), %mm4
	pmuludq	%mm7, %mm2
	movd	16(%eax), %mm3
	pmuludq	%mm7, %mm3
	movd	20(%eax), %mm0
	paddq	%mm2, %mm4
	movd	12(%edx), %mm5
	jmp	L(01)

C This block must stay last, it falls through into the loop at L(top).
L(3):	movd	(%eax), %mm0
	sub	12(%esp), %ecx		C loop count
	lea	-4(%eax), %eax
	pmuludq	%mm7, %mm0
	movd	8(%eax), %mm1
	movd	(%edx), %mm4
	pmuludq	%mm7, %mm1
	movd	12(%eax), %mm2
	paddq	%mm0, %mm4
	movd	4(%edx), %mm5

C Main loop, four limbs per pass.  Each of the four stages multiplies one
C up limb loaded earlier, folds a finished sum into the carry in mm6, loads
C the up and rp limbs needed by later stages, then stores one result limb
C and shifts the carry down.
	ALIGN(16)
L(top):	pmuludq	%mm7, %mm2
	paddq	%mm4, %mm6
	movd	16(%eax), %mm3
	paddq	%mm1, %mm5
	movd	8(%edx), %mm4
	movd	%mm6, 0(%edx)
	psrlq	$32, %mm6
L(10):	pmuludq	%mm7, %mm3
	paddq	%mm5, %mm6
	movd	20(%eax), %mm0
	paddq	%mm2, %mm4
	movd	12(%edx), %mm5
	movd	%mm6, 4(%edx)
	psrlq	$32, %mm6
L(01):	pmuludq	%mm7, %mm0
	paddq	%mm4, %mm6
	movd	24(%eax), %mm1
	paddq	%mm3, %mm5
	movd	16(%edx), %mm4
	movd	%mm6, 8(%edx)
	psrlq	$32, %mm6
L(00):	pmuludq	%mm7, %mm1
	paddq	%mm5, %mm6
	movd	28(%eax), %mm2
	paddq	%mm0, %mm4
	movd	20(%edx), %mm5
	movd	%mm6, 12(%edx)
	psrlq	$32, %mm6
	lea	16(%eax), %eax
	lea	16(%edx), %edx
	add	$4, %ecx
	jnz	L(top)

C Wind-down: the last three limbs are completed here from values already in
C registers plus one more rp limb, with no further loads through eax.
L(end):	pmuludq	%mm7, %mm2
	paddq	%mm4, %mm6
	paddq	%mm1, %mm5
	movd	8(%edx), %mm4
	movd	%mm6, 0(%edx)
	psrlq	$32, %mm6
	paddq	%mm5, %mm6
	paddq	%mm2, %mm4
	movd	%mm6, 4(%edx)
	psrlq	$32, %mm6
	paddq	%mm4, %mm6
	movd	%mm6, 8(%edx)
	psrlq	$32, %mm6
	movd	%mm6, %eax		C return carry
	emms
	ret
EPILOGUE()

C mpn_addmul_1c: same operation with an initial carry taken from sp + 20.
C It preloads the carry register and joins the common code at L(ent).
PROLOGUE(mpn_addmul_1c)
	movd	20(%esp), %mm6		C carry-in
	jmp	L(ent)
EPILOGUE()