dnl  mpn_addmul_1 for Pentium 4 and P6 models with SSE2 (i.e., 9,D,E,F).

dnl  Copyright 2005, 2007, 2011 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library. If not,
dnl  see https://www.gnu.org/licenses/.
include(`../config.m4')

C			    cycles/limb
C P6 model 0-8,10-12		-
C P6 model 9   (Banias)		5.24
C P6 model 13  (Dothan)		5.24
C P4 model 0-1 (Willamette)	5
C P4 model 2   (Northwood)	5
C P4 model 3-4 (Prescott)	5

C TODO:
C  * Tweak eax/edx offsets in loop as to save some lea's
C  * Perhaps software pipeline small-case code

C INPUT PARAMETERS
C rp		sp + 4
C up		sp + 8
C n		sp + 12
C v0		sp + 16
C carry-in	sp + 20		(mpn_addmul_1c entry only)

C OPERATION
C For each of the n limbs: rp[i] = low 32 bits of (rp[i] + up[i]*v0 + carry),
C with the high part carried into the next limb.  The final carry is returned
C in eax.  mpn_addmul_1 starts with carry 0; mpn_addmul_1c starts with the
C value found at sp + 20.
C
C Both loops process a limb before testing the count, so n = 0 is not
C handled here.
C
C REGISTER USE
C edx		rp pointer (biased in the unrolled path, see below)
C eax		up pointer (biased in the unrolled path), finally the result
C ecx		n, then in the unrolled path a negative count stepped by 4
C mm7		v0
C mm6		running carry; low 32 bits are stored, then it is shifted
C		right by 32
C mm0-mm3	up limbs / their products with v0
C mm4,mm5	rp limbs / rp limb plus product
C
C No general purpose register other than eax, ecx and edx is written, and
C nothing is pushed, so the sp offsets above stay valid throughout.

	TEXT
	ALIGN(16)
PROLOGUE(mpn_addmul_1)
	pxor	%mm6, %mm6		C carry-in = 0
L(ent):	mov	4(%esp), %edx		C edx = rp
	mov	8(%esp), %eax		C eax = up
	mov	12(%esp), %ecx		C ecx = n
	movd	16(%esp), %mm7		C mm7 = v0
	cmp	$4, %ecx
	jnc	L(big)			C n >= 4 (unsigned): unrolled code

C Small case, n < 4: one limb per iteration.
L(lp0):	movd	(%eax), %mm0		C up limb
	lea	4(%eax), %eax
	movd	(%edx), %mm4		C rp limb
	lea	4(%edx), %edx
	pmuludq	%mm7, %mm0		C 64-bit product up limb * v0
	paddq	%mm0, %mm4		C + rp limb
	paddq	%mm4, %mm6		C + carry
	movd	%mm6, -4(%edx)		C store low half (edx already advanced)
	psrlq	$32, %mm6		C high half is the next carry
	dec	%ecx
	jnz	L(lp0)
	movd	%mm6, %eax		C return carry-out
	emms				C leave MMX state before returning
	ret

C Large case, n >= 4: dispatch on n mod 4 into the 4-way unrolled loop.
C Each set-up block below
C   - turns ecx into (n mod 4) - n, a negative multiple of 4 that the loop
C     steps towards zero,
C   - biases eax/edx so that the fixed displacements used at its entry
C     point address limb 0, and
C   - preloads the limbs and products the entry stage expects in registers.
C The n mod 4 = 2 block is placed first so that this case falls through
C instead of needing a final unconditional jump.  L(3) has to stay last
C because it falls into L(top).
L(big):	and	$3, %ecx		C ecx = n mod 4
	je	L(0)
	cmp	$2, %ecx
	jc	L(1)			C n mod 4 = 1
	ja	L(3)			C n mod 4 = 3 (flags still from cmp)
					C n mod 4 = 2 falls through

L(2):	movd	(%eax), %mm1
	sub	12(%esp), %ecx		C loop count
	lea	-8(%eax), %eax
	lea	-4(%edx), %edx
	pmuludq	%mm7, %mm1
	movd	12(%eax), %mm2
	movd	4(%edx), %mm5
	pmuludq	%mm7, %mm2
	movd	16(%eax), %mm3
	paddq	%mm1, %mm5
	movd	8(%edx), %mm4
	jmp	L(10)

L(0):	movd	(%eax), %mm3
	sub	12(%esp), %ecx		C loop count
	lea	-16(%eax), %eax
	lea	-12(%edx), %edx
	pmuludq	%mm7, %mm3
	movd	20(%eax), %mm0
	movd	12(%edx), %mm5
	pmuludq	%mm7, %mm0
	movd	24(%eax), %mm1
	paddq	%mm3, %mm5
	movd	16(%edx), %mm4
	jmp	L(00)

L(1):	movd	(%eax), %mm2
	sub	12(%esp), %ecx		C loop count
	lea	-12(%eax), %eax
	lea	-8(%edx), %edx
	movd	8(%edx), %mm4
	pmuludq	%mm7, %mm2
	movd	16(%eax), %mm3
	pmuludq	%mm7, %mm3
	movd	20(%eax), %mm0
	paddq	%mm2, %mm4
	movd	12(%edx), %mm5
	jmp	L(01)

L(3):	movd	(%eax), %mm0
	sub	12(%esp), %ecx		C loop count
	lea	-4(%eax), %eax		C edx needs no bias for this entry
	pmuludq	%mm7, %mm0
	movd	8(%eax), %mm1
	movd	(%edx), %mm4
	pmuludq	%mm7, %mm1
	movd	12(%eax), %mm2
	paddq	%mm0, %mm4
	movd	4(%edx), %mm5
					C falls through into L(top)

C Main loop, four limbs per iteration.  Every stage does the same seven
C steps on rotating registers: multiply a previously loaded up limb by v0,
C add a finished rp+product sum into the carry mm6, load a later up limb,
C add an earlier product to a loaded rp limb, load a later rp limb, store
C the low half of mm6, and shift mm6 down to leave the carry.
C L(10), L(01) and L(00) are the entry points for n mod 4 = 2, 1 and 0.
	ALIGN(16)
L(top):	pmuludq	%mm7, %mm2
	paddq	%mm4, %mm6
	movd	16(%eax), %mm3
	paddq	%mm1, %mm5
	movd	8(%edx), %mm4
	movd	%mm6, 0(%edx)
	psrlq	$32, %mm6
L(10):	pmuludq	%mm7, %mm3
	paddq	%mm5, %mm6
	movd	20(%eax), %mm0
	paddq	%mm2, %mm4
	movd	12(%edx), %mm5
	movd	%mm6, 4(%edx)
	psrlq	$32, %mm6
L(01):	pmuludq	%mm7, %mm0
	paddq	%mm4, %mm6
	movd	24(%eax), %mm1
	paddq	%mm3, %mm5
	movd	16(%edx), %mm4
	movd	%mm6, 8(%edx)
	psrlq	$32, %mm6
L(00):	pmuludq	%mm7, %mm1
	paddq	%mm5, %mm6
	movd	28(%eax), %mm2
	paddq	%mm0, %mm4
	movd	20(%edx), %mm5
	movd	%mm6, 12(%edx)
	psrlq	$32, %mm6
	lea	16(%eax), %eax		C advance both pointers by 4 limbs
	lea	16(%edx), %edx
	add	$4, %ecx
	jnz	L(top)			C until the negative count reaches 0

C Wind-down: three result limbs are still in flight.  No further up limbs
C are loaded; the only load is the rp limb for the last result.
L(end):	pmuludq	%mm7, %mm2
	paddq	%mm4, %mm6
	paddq	%mm1, %mm5
	movd	8(%edx), %mm4
	movd	%mm6, 0(%edx)
	psrlq	$32, %mm6
	paddq	%mm5, %mm6
	paddq	%mm2, %mm4
	movd	%mm6, 4(%edx)
	psrlq	$32, %mm6
	paddq	%mm4, %mm6
	movd	%mm6, 8(%edx)
	psrlq	$32, %mm6
	movd	%mm6, %eax		C return carry-out
	emms				C leave MMX state before returning
	ret
EPILOGUE()

C mpn_addmul_1c: same operation as mpn_addmul_1, but the carry starts from
C the fifth argument (sp + 20) instead of zero.  It jumps into mpn_addmul_1
C after the pxor; nothing has been pushed, so the argument offsets used at
C L(ent) are unchanged.
PROLOGUE(mpn_addmul_1c)
	movd	20(%esp), %mm6		C mm6 = carry-in
	jmp	L(ent)
EPILOGUE()