dnl  AMD K7 mpn_mul_1.

dnl  Copyright 1999, 2000, 2001, 2002, 2005, 2008 Free Software Foundation,
dnl  Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or
dnl  modify it under the terms of the GNU Lesser General Public License as
dnl  published by the Free Software Foundation; either version 3 of the
dnl  License, or (at your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful,
dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
dnl  Lesser General Public License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C                           cycles/limb
C P5
C P6 model 0-8,10-12
C P6 model 9  (Banias)
C P6 model 13 (Dothan)
C P4 model 0  (Willamette)
C P4 model 1  (?)
C P4 model 2  (Northwood)
C P4 model 3  (Prescott)
C P4 model 4  (Nocona)
C AMD K6
C AMD K7                     3.25
C AMD K8

C TODO
C  * Improve feed-in and wind-down code.  We beat the old code for all n != 1,
C    but we might be able to do even better.
C  * The feed-in code for mul_1c is crude.

ASM_START()
	TEXT

C ---------------------------------------------------------------------------
C mpn_mul_1c -- multiply a vector of 32-bit limbs by a single limb, starting
C from a caller-supplied carry-in limb.
C
C Syntax: AT&T operand order (source, destination), 32-bit x86, arguments
C passed on the stack.
C
C Stack slots, as seen after the 16-byte register save area is allocated:
C	20(%esp)	destination pointer	(kept in %edi)
C	24(%esp)	source pointer		(kept in %esi)
C	28(%esp)	limb count n; the slot is then overwritten with n/4
C			and used as the loop counter
C	32(%esp)	multiplier limb		(kept in %ecx)
C	36(%esp)	carry-in limb, added into the least significant product
C
C Returns in %eax the high half of the last product plus the final carry.
C %ebp, %ebx, %esi and %edi are saved on entry and restored at L(rt);
C %eax, %ecx, %edx and the flags are clobbered.
C
C This entry point only contains feed-in code.  Each path forms the first
C product, folds in the carry-in, and then jumps into the loop or wind-down
C code that lives inside mpn_mul_1 below.
C
C Carry handling: registers are zeroed with "mov $0" between an add and the
C following adc.  mov does not modify the flags, so the carry produced by
C the add survives; do not replace these with xor.
C
C NOTE(review): n == 0 is not handled.  The L(c0) path would be taken and
C the loop counter would wrap on the first decl.  Presumably callers
C guarantee n >= 1 -- confirm against the mpn interface contract.
C ---------------------------------------------------------------------------
	ALIGN(16)
PROLOGUE(mpn_mul_1c)
	add	$-16, %esp		C room for the four saved registers
	mov	%ebp, (%esp)
	mov	%ebx, 4(%esp)
	mov	%esi, 8(%esp)
	mov	%edi, 12(%esp)

	mov	20(%esp), %edi		C destination pointer
	mov	24(%esp), %esi		C source pointer
	mov	28(%esp), %ebp		C n
	mov	32(%esp), %ecx		C multiplier limb
	mov	%ebp, %ebx
	shr	$2, %ebp		C ebp = n / 4
	mov	%ebp, 28(%esp)		C count slot now holds the loop counter
	mov	(%esi), %eax		C preload the first source limb
	and	$3, %ebx		C n mod 4 selects the feed-in path
	jz	L(c0)
	cmp	$2, %ebx
	mov	36(%esp), %ebx		C carry-in; mov keeps the cmp flags
	jz	L(c2)
	jg	L(c3)

C n mod 4 == 1
L(c1):	lea	-4(%edi), %edi		C bias dst so 4(%edi) is the first limb
	mul	%ecx
	test	%ebp, %ebp		C any full groups of four to follow?
	jnz	1f
	add	%ebx, %eax		C n == 1: low half + carry-in
	mov	%eax, 4(%edi)
	mov	%edx, %eax
	adc	%ebp, %eax		C ebp is zero here, so this adds only CF
	jmp	L(rt)
1:	add	%eax, %ebx		C ebx = first result limb
	mov	$0, %ebp
	adc	%edx, %ebp		C ebp = high half + carry
	mov	4(%esi), %eax		C next source limb
	jmp	L(1)

C n mod 4 == 2.  Both branches do the same arithmetic and differ only in
C whether they continue in the loop or go straight to the wind-down.
L(c2):	lea	4(%esi), %esi
	mul	%ecx
	test	%ebp, %ebp
	mov	%ebx, %ebp		C ebp = carry-in; flags from test are kept
	jnz	2f
	add	%eax, %ebp		C n == 2
	mov	$0, %ebx
	adc	%edx, %ebx
	mov	(%esi), %eax
	jmp	L(cj2)
2:	add	%eax, %ebp
	mov	$0, %ebx
	adc	%edx, %ebx
	mov	(%esi), %eax
	jmp	L(2)

C n mod 4 == 3
L(c3):	lea	8(%esi), %esi
	lea	-12(%edi), %edi		C bias dst so 12(%edi) is the first limb
	mul	%ecx
	add	%eax, %ebx
	mov	$0, %ebp
	adc	%edx, %ebp
	mov	-4(%esi), %eax
	incl	28(%esp)		C offsets the decl met at the tail of the loop
	jmp	L(3)

C n mod 4 == 0
L(c0):	mov	36(%esp), %ebx		C carry-in
	lea	-4(%esi), %esi
	lea	-8(%edi), %edi		C bias dst so 8(%edi) is the first limb
	mul	%ecx
	mov	%ebx, %ebp
	add	%eax, %ebp
	mov	$0, %ebx
	adc	%edx, %ebx
	mov	8(%esi), %eax
	jmp	L(0)

EPILOGUE()

C ---------------------------------------------------------------------------
C mpn_mul_1 -- multiply a vector of 32-bit limbs by a single limb.
C
C Same stack layout as mpn_mul_1c above, except that no carry-in slot is
C read:
C	20(%esp) destination pointer, 24(%esp) source pointer,
C	28(%esp) limb count n (slot reused as the loop counter n/4),
C	32(%esp) multiplier limb.
C
C Returns in %eax the high half of the last product plus the final carry.
C %ebp, %ebx, %esi and %edi are saved on entry and restored at L(rt).
C
C Structure:
C   * feed-in L(b1), L(b2), L(b3), L(b0), chosen by n mod 4; each forms the
C     first product and enters the loop at L(1), L(2), L(3) or L(0);
C   * the loop at L(top), which handles four limbs per iteration;
C   * wind-down L(end)/L(cj2), which performs the last multiply, stores the
C     final two limbs and produces the return value;
C   * shared exit L(rt), also used by mpn_mul_1c.
C
C Inside the loop %ebx and %ebp alternate roles: one holds a finished
C result limb waiting to be stored while the other accumulates the next
C limb.  %eax always holds a source limb loaded ahead of its mul.
C
C The carry chain depends on mov, lea and decl not modifying the carry
C flag.  In particular the adc at L(top) and L(end) consumes the carry
C produced by the add just before the decl at the tail of the loop.
C
C NOTE(review): n == 0 is not handled (see the note on mpn_mul_1c).
C ---------------------------------------------------------------------------
	ALIGN(16)
PROLOGUE(mpn_mul_1)
	add	$-16, %esp		C room for the four saved registers
	mov	%ebp, (%esp)
	mov	%ebx, 4(%esp)
	mov	%esi, 8(%esp)
	mov	%edi, 12(%esp)

	mov	20(%esp), %edi		C destination pointer
	mov	24(%esp), %esi		C source pointer
	mov	28(%esp), %ebp		C n
	mov	32(%esp), %ecx		C multiplier limb
	mov	%ebp, %ebx
	shr	$2, %ebp		C ebp = n / 4
	mov	%ebp, 28(%esp)		C count slot now holds the loop counter
	mov	(%esi), %eax		C preload the first source limb
	and	$3, %ebx		C n mod 4 selects the feed-in path
	jz	L(b0)
	cmp	$2, %ebx
	jz	L(b2)
	jg	L(b3)

C n mod 4 == 1
L(b1):	lea	-4(%edi), %edi		C bias dst so 4(%edi) is the first limb
	mul	%ecx
	test	%ebp, %ebp		C any full groups of four to follow?
	jnz	L(gt1)
	mov	%eax, 4(%edi)		C n == 1: store low half, return high half
	mov	%edx, %eax
	jmp	L(rt)
L(gt1):	mov	%eax, %ebx
	mov	%edx, %ebp
	mov	4(%esi), %eax
	jmp	L(1)

C n mod 4 == 2
L(b2):	lea	4(%esi), %esi
	mul	%ecx
	test	%ebp, %ebp		C flags survive the three movs below
	mov	%eax, %ebp
	mov	%edx, %ebx
	mov	(%esi), %eax
	jnz	L(2)
	jmp	L(cj2)			C n == 2: straight to the wind-down

C n mod 4 == 3
L(b3):	lea	8(%esi), %esi
	lea	-12(%edi), %edi		C bias dst so 12(%edi) is the first limb
	mul	%ecx
	mov	%eax, %ebx
	mov	%edx, %ebp
	mov	-4(%esi), %eax
	incl	28(%esp)		C offsets the decl met at the tail of the loop
	jmp	L(3)

C n mod 4 == 0
L(b0):	lea	-4(%esi), %esi
	lea	-8(%edi), %edi		C bias dst so 8(%edi) is the first limb
	mul	%ecx
	mov	%eax, %ebp
	mov	%edx, %ebx
	mov	8(%esi), %eax
	jmp	L(0)

C Main loop, four limbs per iteration.
	ALIGN(16)
L(top):	mov	$0, %ebx
	adc	%edx, %ebx		C high half of previous product + carry
L(2):	mul	%ecx
	add	%eax, %ebx
	mov	%ebp, 0(%edi)
	mov	4(%esi), %eax
	mov	$0, %ebp
	adc	%edx, %ebp
L(1):	mul	%ecx
	add	%eax, %ebp
	mov	8(%esi), %eax
	mov	%ebx, 4(%edi)
	mov	$0, %ebx
	adc	%edx, %ebx
L(0):	mov	%ebp, 8(%edi)
	mul	%ecx
	add	%eax, %ebx
	mov	12(%esi), %eax
	lea	16(%esi), %esi		C advance src; lea leaves CF untouched
	mov	$0, %ebp
	adc	%edx, %ebp
L(3):	mov	%ebx, 12(%edi)
	mul	%ecx
	lea	16(%edi), %edi		C advance dst
	add	%eax, %ebp		C CF from here is consumed after the branch
	decl	28(%esp)		C sets ZF for jnz, does not modify CF
	mov	0(%esi), %eax
	jnz	L(top)

C Wind-down: one more multiply, then store the last two result limbs.
L(end):	mov	$0, %ebx
	adc	%edx, %ebx
L(cj2):	mul	%ecx
	add	%eax, %ebx
	mov	%ebp, (%edi)
L(cj1):	mov	%ebx, 4(%edi)
	adc	$0, %edx		C fold the last carry into the high limb
	mov	%edx, %eax		C return value

C Common exit: restore the saved registers and release the save area.
L(rt):	mov	(%esp), %ebp
	mov	4(%esp), %ebx
	mov	8(%esp), %esi
	mov	12(%esp), %edi
	add	$16, %esp
	ret
EPILOGUE()
ASM_END()