dnl  Intel Atom mpn_addlshC_n/mpn_sublshC_n -- rp[] = up[] +- (vp[] << C)

dnl  Contributed to the GNU project by Marco Bodrato.

dnl  Copyright 2011 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or
dnl  modify it under the terms of the GNU Lesser General Public License as
dnl  published by the Free Software Foundation; either version 3 of the
dnl  License, or (at your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful,
dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
dnl  Lesser General Public License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C This file is a template.  LSH, RSH, M4_inst, M4_opp and the four function
C names M4_function, M4_function_c, M4_ip_function and M4_ip_function_c are
C not defined here; they are expected to come from the file that includes
C this one.  Presumably RSH is 32-LSH and M4_inst is an add-with-carry or
C subtract-with-borrow instruction -- confirm against the including file.

C mp_limb_t mpn_addlshC_n_ip1 (mp_ptr dst, mp_srcptr src, mp_size_t size);
C mp_limb_t mpn_addlshC_nc_ip1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
C				mp_limb_t carry);
C mp_limb_t mpn_sublshC_n_ip1 (mp_ptr dst, mp_srcptr src, mp_size_t size);
C mp_limb_t mpn_sublshC_nc_ip1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
C				mp_signed_limb_t borrow);

C Stack arguments of the in-place entry points, in prototype order:
C dst, src, size, then the carry or borrow of the nc variants.
defframe(PARAM_CORB, 16)
defframe(PARAM_SIZE, 12)
defframe(PARAM_SRC, 8)
defframe(PARAM_DST, 4)

C mp_limb_t mpn_addlshC_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
C			   mp_size_t size);
C mp_limb_t mpn_addlshC_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
C			    mp_size_t size, mp_limb_t carry);
C mp_limb_t mpn_sublshC_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
C			   mp_size_t size);
C mp_limb_t mpn_sublshC_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
C			    mp_size_t size, mp_limb_t borrow);

C if src1 == dst, _ip1 is used

C				cycles/limb
C				dst!=src1,src2	dst==src1
C P5
C P6 model 0-8,10-12
C P6 model 9  (Banias)
C P6 model 13 (Dothan)
C P4 model 0  (Willamette)
C P4 model 1  (?)
C P4 model 2  (Northwood)
C P4 model 3  (Prescott)
C P4 model 4  (Nocona)
C Intel Atom			 7		 6
C AMD K6
C AMD K7
C AMD K8
C AMD K10

C Additional names for the three-operand entry points, whose argument list
C is one slot longer: src2, size, then carry or borrow.  dst and src1 are
C still read through PARAM_DST and PARAM_SRC.
defframe(GPARAM_CORB, 20)
defframe(GPARAM_SIZE, 16)
defframe(GPARAM_SRC2, 12)

dnl  re-use parameter space
dnl  Each slot below is overwritten only after the argument stored in it
dnl  has been loaded into a register.
define(SAVE_EBP,`PARAM_SIZE')
define(SAVE_EBX,`PARAM_SRC')
define(SAVE_UP,`PARAM_DST')

C M is 1 << LSH.  It is used as the scale factor of lea so that
C base + limb*M yields the shifted limb plus the bits held in base.
define(M, eval(m4_lshift(1,LSH)))
define(`rp', `%edi')
define(`up', `%esi')

ASM_START()
	TEXT
	ALIGN(8)

C In-place entry with carry-in.  The carry or borrow argument is split:
C bit LSH is isolated in edx as 0 or 1, and M4_opp then applies that bit to
C ecx, which is what gets merged into the first shifted limb.  Execution
C continues in the shared body at L(start_nc).
C
C NOTE(review): edx is 0 or 1 here.  The loop recovers the carry flag with
C "add edx,edx", which sets the flag from the carry out of bit 31, while the
C tail uses "shr edx", which sets it from bit 0.  As written, an edx of 1
C only reaches the carry flag when size is 1.  Confirm whether callers can
C pass a carry-in with bit LSH set.
PROLOGUE(M4_ip_function_c)
deflit(`FRAME',0)
	movl	PARAM_CORB, %ecx
	movl	%ecx, %edx
	shr	$LSH, %edx
	andl	$1, %edx
	M4_opp	%edx, %ecx
	jmp	L(start_nc)
EPILOGUE()

C In-place variant: dst[] is combined, using M4_inst, with src[] shifted
C left by LSH bits.
C
C Register use after setup:
C   rp  (edi)	destination pointer; M4_inst reads and writes through it
C   up  (esi)	pointer to the limbs that get shifted
C   ebx		loop counter, (size+1)/2; the loop handles two limbs per pass
C   eax, ecx	alternate between the current source limb and the bits
C		left over from the previous limb (limb >> RSH)
C   ebp		scratch for an assembled shifted limb
C   edx		carry flag parked between passes: "sbb edx,edx" stores it
C		as 0 or -1, "add edx,edx" moves it back into the flag
C
C Returns in eax the last limb shifted right by RSH, plus the final carry.
C
C The counter is decremented before it is tested, so size is assumed to be
C at least 1; a size of 0 would make the counter wrap.
PROLOGUE(M4_ip_function)
deflit(`FRAME',0)

	xor	%ecx, %ecx		C nothing to merge into the first limb
	xor	%edx, %edx		C no pending carry
L(start_nc):
	push	rp			FRAME_pushl()
	mov	PARAM_DST, rp
	mov	up, SAVE_UP		C overwrites the dst slot, already loaded
	mov	PARAM_SRC, up
	mov	%ebx, SAVE_EBX		C overwrites the src slot, already loaded
	mov	PARAM_SIZE, %ebx	C size
C The three-operand function jumps here when dst == src1, with rp, up and
C ebx already set up and ebp still to be saved.
L(inplace):
	incl	%ebx			C size + 1
	shr	%ebx			C (size+1)\2
	mov	%ebp, SAVE_EBP		C mov leaves the flags from shr intact
	jnc	L(entry)		C size odd

C Even size: process one limb first by entering the loop at its second half.
	add	%edx, %edx		C size even
	mov	%ecx, %ebp		C ebp = bits to merge into limb 0
	mov	(up), %ecx		C ecx = limb 0
	lea	-4(rp), rp		C bias rp so that 4(rp) is dst[0]
	lea	(%ebp,%ecx,M), %eax	C eax = limb 0 shifted, plus ebp
	lea	4(up), up
	jmp	L(enteven)

	ALIGN(16)
C On entry to each pass: eax = current limb, ecx = bits left over from the
C previous limb, edx = parked carry.
L(oop):
	lea	(%ecx,%eax,M), %ebp	C first shifted limb of the pair
	shr	$RSH, %eax		C eax = its left-over bits
	mov	4(up), %ecx		C ecx = second limb of the pair
	add	%edx, %edx		C parked carry back into the flag
	lea	8(up), up		C lea does not touch the flags
	M4_inst	%ebp, (rp)
	lea	(%eax,%ecx,M), %eax	C second shifted limb of the pair

L(enteven):
	M4_inst	%eax, 4(rp)
	lea	8(rp), rp

	sbb	%edx, %edx		C park the carry: edx = 0 or -1
	shr	$RSH, %ecx		C ecx = left-over bits for the next limb

L(entry):
	mov	(up), %eax
	decl	%ebx
	jnz	L(oop)

C Tail: one limb is left in eax.
	lea	(%ecx,%eax,M), %ebp
	shr	$RSH, %eax		C bits shifted out of the last limb
	shr	%edx			C bit 0 of the parked carry into the flag
	M4_inst	%ebp, (rp)
	mov	SAVE_UP, up
	adc	$0, %eax		C return value = shifted-out bits + carry
	mov	SAVE_EBP, %ebp
	mov	SAVE_EBX, %ebx
	pop	rp			FRAME_popl()
	ret
EPILOGUE()

C Three-operand entry with carry-in.  Same splitting of the carry or borrow
C argument as in the in-place carry entry, but read from GPARAM_CORB.
C Execution continues in the shared body at L(generic_nc).
C
C NOTE(review): edx is 0 or 1 here, while the loop restores the flag with
C "add edx,edx" and only the tail uses "shr edx".  As written, an edx of 1
C only reaches the carry flag when size is 1.  Confirm whether callers can
C pass a carry-in with bit LSH set.
PROLOGUE(M4_function_c)
deflit(`FRAME',0)
	movl	GPARAM_CORB, %ecx
	movl	%ecx, %edx
	shr	$LSH, %edx
	andl	$1, %edx
	M4_opp	%edx, %ecx
	jmp	L(generic_nc)
EPILOGUE()

C Three-operand variant: dst[] receives src1[] combined, using M4_inst, with
C src2[] shifted left by LSH bits.  When dst == src1 the work is handed to
C the in-place loop at L(inplace).
C
C Register use in the general loop:
C   rp  (edi)	destination pointer, written only
C   ebx		pointer to src1, whose limbs are used unshifted
C   up  (esi)	pointer to src2, whose limbs get shifted
C   eax, ecx	current src2 limb and left-over bits, alternating
C   ebp		scratch for an assembled shifted limb
C   edx		parked carry between passes, and scratch for the src1 limb
C		inside a pass
C   GPARAM_SIZE	stack slot reused as the loop counter, (size+1)/2
C
C Returns in eax the last src2 limb shifted right by RSH, plus the final
C carry.  As with the in-place loop, size is assumed to be at least 1.
PROLOGUE(M4_function)
deflit(`FRAME',0)

	xor	%ecx, %ecx		C nothing to merge into the first limb
	xor	%edx, %edx		C no pending carry
L(generic_nc):
	push	rp			FRAME_pushl()
	mov	PARAM_DST, rp
	mov	up, SAVE_UP		C overwrites the dst slot, already loaded
	mov	PARAM_SRC, up		C up = src1 for now
	cmp	rp, up			C dst == src1 ?
	mov	%ebx, SAVE_EBX		C mov leaves the flags from cmp intact
	jne	L(general)
	mov	GPARAM_SIZE, %ebx	C size
	mov	GPARAM_SRC2, up		C the shifted operand of the in-place loop
	jmp	L(inplace)

L(general):
	mov	GPARAM_SIZE, %eax	C size
	mov	%ebx, SAVE_EBX		C repeats the store done before the branch
	incl	%eax			C size + 1
	mov	up, %ebx		C ebx = src1, used unshifted
	mov	GPARAM_SRC2, up		C up = src2, the shifted operand
	shr	%eax			C (size+1)\2
	mov	%ebp, SAVE_EBP		C overwrites the src2 slot, already loaded
	mov	%eax, GPARAM_SIZE	C loop counter kept on the stack
	jnc	L(entry2)		C size odd

C Even size: process one limb first by entering the loop at its second half.
	add	%edx, %edx		C size even
	mov	%ecx, %ebp		C ebp = bits to merge into limb 0
	mov	(up), %ecx		C ecx = src2 limb 0
	lea	-4(rp), rp		C bias rp so that 4(rp) is dst[0]
	lea	-4(%ebx), %ebx		C same bias for the src1 pointer
	lea	(%ebp,%ecx,M), %eax	C eax = limb 0 shifted, plus ebp
	lea	4(up), up
	jmp	L(enteven2)

	ALIGN(16)
C On entry to each pass: eax = current src2 limb, ecx = bits left over from
C the previous limb, edx = parked carry.
L(oop2):
	lea	(%ecx,%eax,M), %ebp	C first shifted limb of the pair
	shr	$RSH, %eax		C eax = its left-over bits
	mov	4(up), %ecx		C ecx = second src2 limb of the pair
	add	%edx, %edx		C parked carry back into the flag
	lea	8(up), up
	mov	(%ebx), %edx		C src1 limb; mov keeps the flag
	M4_inst	%ebp, %edx
	lea	(%eax,%ecx,M), %eax	C second shifted limb of the pair
	mov	%edx, (rp)
L(enteven2):
	mov	4(%ebx), %edx		C second src1 limb
	lea	8(%ebx), %ebx
	M4_inst	%eax, %edx
	mov	%edx, 4(rp)
	sbb	%edx, %edx		C park the carry: edx = 0 or -1
	shr	$RSH, %ecx		C ecx = left-over bits for the next limb
	lea	8(rp), rp
L(entry2):
	mov	(up), %eax
	decl	GPARAM_SIZE
	jnz	L(oop2)

C Tail: one src2 limb is left in eax.
	lea	(%ecx,%eax,M), %ebp
	shr	$RSH, %eax		C bits shifted out of the last limb
	shr	%edx			C bit 0 of the parked carry into the flag
	mov	(%ebx), %edx		C last src1 limb
	M4_inst	%ebp, %edx
	mov	%edx, (rp)
	mov	SAVE_UP, up
	adc	$0, %eax		C return value = shifted-out bits + carry
	mov	SAVE_EBP, %ebp
	mov	SAVE_EBX, %ebx
	pop	rp			FRAME_popl()
	ret
EPILOGUE()

ASM_END()