1dnl AMD K7 mpn_addmul_1/mpn_submul_1 -- add or subtract mpn multiple. 2 3dnl Copyright 1999-2002, 2005, 2008 Free Software Foundation, Inc. 4 5dnl This file is part of the GNU MP Library. 6dnl 7dnl The GNU MP Library is free software; you can redistribute it and/or modify 8dnl it under the terms of either: 9dnl 10dnl * the GNU Lesser General Public License as published by the Free 11dnl Software Foundation; either version 3 of the License, or (at your 12dnl option) any later version. 13dnl 14dnl or 15dnl 16dnl * the GNU General Public License as published by the Free Software 17dnl Foundation; either version 2 of the License, or (at your option) any 18dnl later version. 19dnl 20dnl or both in parallel, as here. 21dnl 22dnl The GNU MP Library is distributed in the hope that it will be useful, but 23dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 24dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 25dnl for more details. 26dnl 27dnl You should have received copies of the GNU General Public License and the 28dnl GNU Lesser General Public License along with the GNU MP Library. If not, 29dnl see https://www.gnu.org/licenses/. 30 31include(`../config.m4') 32 33 34C cycles/limb 35C P5 36C P6 model 0-8,10-12 37C P6 model 9 (Banias) 6.5 38C P6 model 13 (Dothan) 39C P4 model 0 (Willamette) 40C P4 model 1 (?) 41C P4 model 2 (Northwood) 42C P4 model 3 (Prescott) 43C P4 model 4 (Nocona) 44C AMD K6 45C AMD K7 3.75 46C AMD K8 47 48C TODO 49C * Improve feed-in and wind-down code. We beat the old code for all n != 1, 50C but lose by 2x for n == 1. 
dnl  Select the memory-update instruction and the exported symbol name
dnl  depending on which operation this file is being built as.
ifdef(`OPERATION_addmul_1',`
	define(`ADDSUB',	`add')
	define(`func',	`mpn_addmul_1')
')
ifdef(`OPERATION_submul_1',`
	define(`ADDSUB',	`sub')
	define(`func',	`mpn_submul_1')
')

MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)

C mp_limb_t func (mp_ptr rp, mp_srcptr up, mp_size_t n, mp_limb_t v0)
C
C Compute rp[i] ADDSUB= up[i] * v0 for i = 0 .. n-1 and return the final
C carry-out (addmul) or borrow-out (submul) limb in %eax.
C
C Stack args (after the 16-byte register-save area below %esp):
C   20(%esp)  rp   -> %edi    destination limb vector
C   24(%esp)  up   -> %esi    source limb vector
C   28(%esp)  n    -> %eax    vector length in limbs (assumed >= 1)
C   32(%esp)  v0   -> %ecx    multiplier limb
C
C Register roles in the main loop:
C   %eax      limb in / low product out of "mul"
C   %edx      high product word out of "mul"
C   %ebx,%ebp rolling carry accumulators, alternating low/high each limb
C   28(%esp)  reused in place as the loop counter, floor(n/4)
C
C The loop is unrolled 4x.  n mod 4 selects one of four entry points
C (L(b0)..L(b3)); each entry pre-biases %esi/%edi so that the fixed
C 0/4/8/12 displacements inside the unrolled body address the right limbs.

ASM_START()
	TEXT
	ALIGN(16)
PROLOGUE(func)
	C Save callee-saved registers in a 16-byte scratch area.  Plain mov
	C stores after one %esp adjustment instead of four pushes --
	C presumably better for K7 scheduling; TODO confirm.
	add	$-16, %esp
	mov	%ebp, (%esp)
	mov	%ebx, 4(%esp)
	mov	%esi, 8(%esp)
	mov	%edi, 12(%esp)

	mov	20(%esp), %edi		C rp
	mov	24(%esp), %esi		C up
	mov	28(%esp), %eax		C n
	mov	32(%esp), %ecx		C v0
	mov	%eax, %ebx
	shr	$2, %eax		C floor(n/4) = full 4-limb groups
	mov	%eax, 28(%esp)		C overwrite the n arg slot with n/4
	mov	(%esi), %eax		C first source limb, ready for mul
	and	$3, %ebx		C n mod 4 picks the feed-in path
	jz	L(b0)
	cmp	$2, %ebx
	jz	L(b2)
	jg	L(b3)

C n == 1 (mod 4): bias pointers by -4 so L(1)'s 4/8/12 offsets line up.
L(b1):	lea	-4(%esi), %esi
	lea	-4(%edi), %edi
	mul	%ecx			C %edx:%eax = up[0] * v0
	mov	%eax, %ebx
	mov	%edx, %ebp
	cmpl	$0, 28(%esp)		C n/4 == 0 means n == 1 exactly:
	jz	L(cj1)			C go straight to the final store
	mov	8(%esi), %eax		C next source limb
	jmp	L(1)

C n == 2 (mod 4): pointers already line up with L(2)'s 0/4 offsets.
L(b2):	mul	%ecx			C %edx:%eax = up[0] * v0
	mov	%eax, %ebp
	mov	4(%esi), %eax		C next source limb
	mov	%edx, %ebx
	cmpl	$0, 28(%esp)		C n == 2 exactly?
	jne	L(2)
	jmp	L(cj2)

C n == 3 (mod 4): bias pointers by -12 so L(3)'s offset 12 hits rp[0].
L(b3):	lea	-12(%esi), %esi
	lea	-12(%edi), %edi
	mul	%ecx			C %edx:%eax = up[0] * v0
	mov	%eax, %ebx
	mov	%edx, %ebp
	mov	16(%esi), %eax		C next source limb
	incl	28(%esp)		C the decl at loop bottom will consume
	jmp	L(3)			C one count for this partial group

C n == 0 (mod 4), n >= 4: bias by -8 to match L(0)'s 8/12 offsets.
L(b0):	lea	-8(%esi), %esi
	lea	-8(%edi), %edi
	mul	%ecx			C %edx:%eax = up[0] * v0
	mov	%eax, %ebp
	mov	12(%esi), %eax		C next source limb
	mov	%edx, %ebx
	jmp	L(0)

C Main loop: 4 limbs per iteration.  Each limb step folds the carry or
C borrow from the memory ADDSUB plus the previous high word into the
C %ebx/%ebp pair.  "mov $0" (not xor) is used deliberately: it does not
C touch the carry flag, which must survive into the following adc.
	ALIGN(16)
L(top):	lea	16(%edi), %edi
L(2):	mul	%ecx			C %edx:%eax = up[i] * v0
	ADDSUB	%ebp, 0(%edi)		C rp[i] +-= low accumulator
	mov	$0, %ebp		C clear without clobbering CF
	adc	%eax, %ebx		C fold CF + low product
	mov	8(%esi), %eax		C load next limb early
	adc	%edx, %ebp		C fold high product + its carry
L(1):	mul	%ecx
	ADDSUB	%ebx, 4(%edi)
	mov	$0, %ebx
	adc	%eax, %ebp
	mov	12(%esi), %eax
	adc	%edx, %ebx
L(0):	mul	%ecx
	ADDSUB	%ebp, 8(%edi)
	mov	$0, %ebp
	adc	%eax, %ebx
	adc	%edx, %ebp
	mov	16(%esi), %eax
L(3):	mul	%ecx
	ADDSUB	%ebx, 12(%edi)
	adc	%eax, %ebp
	mov	20(%esi), %eax		C speculative load; harmlessly one
	lea	16(%esi), %esi		C past the end on the last pass
	mov	$0, %ebx		C mov, not xor: CF still needed
	adc	%edx, %ebx
	decl	28(%esp)		C one 4-limb group done
	jnz	L(top)

C Wind-down: the last two limb updates and the return value.
L(end):	lea	16(%edi), %edi
L(cj2):	mul	%ecx			C product of the final loaded limb
	ADDSUB	%ebp, (%edi)		C second-to-last limb update
	adc	%eax, %ebx
	adc	$0, %edx		C absorb carry into the high word
L(cj1):	ADDSUB	%ebx, 4(%edi)		C last limb update
	adc	$0, %edx		C final carry/borrow
	mov	%edx, %eax		C return it
	mov	(%esp), %ebp		C restore callee-saved registers
	mov	4(%esp), %ebx
	mov	8(%esp), %esi
	mov	12(%esp), %edi
	add	$16, %esp
	ret
EPILOGUE()
ASM_END()