dnl  AMD64 mpn_mod_34lsub1 -- remainder modulo 2^48-1.

dnl  Copyright 2000, 2001, 2002, 2004, 2005, 2007, 2009, 2010, 2011, 2012 Free
dnl  Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C	     cycles/limb
C AMD K8,K9	 0.67	   0.583 is possible with zero-reg instead of $0, 4-way
C AMD K10	 0.67	   this seems hard to beat
C AMD bd1	 1
C AMD bobcat	 1.07
C Intel P4	 7.35	   terrible, use old code
C Intel core2	 1.25	   1+epsilon with huge unrolling
C Intel NHM	 1.15	   this seems hard to beat
C Intel SBR	 0.93
C Intel atom	 2.5
C VIA nano	 1.25	   this seems hard to beat

C INPUT PARAMETERS
C   ap   %rdi   pointer to the source limbs (the prototype below calls this
C               argument up)
C   n    %rsi   number of limbs at ap
C Note that the instructions in this file name %rsi directly and never use
C the n alias.
define(`ap',	%rdi)
define(`n',	%rsi)

C mp_limb_t mpn_mod_34lsub1 (mp_srcptr up, mp_size_t n)

C TODO
C  * Review feed-in and wind-down code.
ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

C mpn_mod_34lsub1 (ap, n) -- fold the n limbs at ap into one limb that is
C congruent to the whole operand modulo 2^48-1.
C
C Since 2^48 = 1 (mod 2^48-1), the 64-bit limb at index i has weight
C 2^(64*i) = 2^(16*(i mod 3)).  The limbs are summed into three accumulators
C selected by i mod 3, and the accumulators are combined at the end:
C
C   %rax   sum of limbs with i = 0 mod 3, weight 2^0
C   %rcx   sum of limbs with i = 1 mod 3, weight 2^16
C   %rdx   sum of limbs with i = 2 mod 3, weight 2^32
C   %r9    count of carries out of %rdx, weight 2^192 = 2^0
C   %r11   the constant 2^48-1, used as a mask
C   %r8, %r10, %rdi   scratch in the table dispatch and in the final fold
C
C The result is left in %rax.  The final fold adds its pieces without a
C closing reduction step, so the value is congruent to the operand but is not
C brought below 2^48-1 here.
C
C FUNC_ENTRY, FUNC_EXIT, R32, JUMPTABSECT and JMPENT are macros that are not
C defined in this file (presumably they come from the included config.m4).

ASM_START()
	TEXT
	ALIGN(32)
PROLOGUE(mpn_mod_34lsub1)
	FUNC_ENTRY(2)

	mov	$0x0000FFFFFFFFFFFF, %r11	C mask 2^48-1

C NOTE(review): limb 0 is read before n is tested, so n >= 1 looks like a
C precondition - confirm against the callers.
	mov	(ap), %rax

	cmp	$2, %rsi
	ja	L(gt2)			C n > 2: general code

	jb	L(one)			C n < 2: return limb 0 as loaded

C n = 2.  src[0] has weight 2^0 and is split at bit 48; src[1] has weight
C 2^16 and is split at bit 32.  %rsi is reused for src[1] as the count is no
C longer needed.
	mov	8(ap), %rsi
	mov	%rax, %rdx
	shr	$48, %rax		C src[0] high

	and	%r11, %rdx		C src[0] low
	add	%rdx, %rax
	mov	R32(%rsi), R32(%rdx)	C low 32 bits of src[1]

	shr	$32, %rsi		C src[1] high
	add	%rsi, %rax

	shl	$16, %rdx		C src[1] low
	add	%rdx, %rax
L(one):	FUNC_EXIT()
	ret


C Don't change this, the wind-down code is not able to handle greater values
define(UNROLL,3)

C Feed-in.  Limbs 0..2 initialise the three accumulators (%rax already holds
C limb 0) and %r9 is cleared.  %rsi then becomes n - (UNROLL*3+3), that is
C the number of limbs not yet added minus UNROLL*3; a borrow means that fewer
C than UNROLL*3 limbs are left and the main loop is skipped.
L(gt2):	mov	8(ap), %rcx
	mov	16(ap), %rdx
	xor	%r9, %r9
	add	$24, ap
	sub	$eval(UNROLL*3+3), %rsi
	jc	L(end)
	ALIGN(16)
C Main loop, UNROLL groups of three limbs per pass.  Within a group the carry
C ripples %rax -> %rcx -> %rdx -> %r9.  At L(top) %rsi holds the number of
C limbs still to add minus UNROLL*3 and is not negative.
L(top):
	add	(ap), %rax
	adc	8(ap), %rcx
	adc	16(ap), %rdx
	adc	$0, %r9
forloop(i,1,UNROLL-1,`dnl
	add	eval(i*24)(ap), %rax
	adc	eval(i*24+8)(ap), %rcx
	adc	eval(i*24+16)(ap), %rdx
	adc	$0, %r9
')dnl
	add	$eval(UNROLL*24), ap
	sub	$eval(UNROLL*3), %rsi
	jnc	L(top)

C Here %rsi is in the range -9 .. -1 and %rsi + 9 is the number of limbs
C still to add (0 .. 8).  Dispatch on that number through the table at
C L(tab).  The displacements 36 = 9*4 and 72 = 9*8 cancel the bias of -9 for
C table entries of 4 and 8 bytes respectively.  With PIC defined the entry is
C loaded as a sign-extended 32-bit value and added to the table address;
C otherwise the entry itself is the jump target.
L(end):
	lea	L(tab)(%rip), %r8
ifdef(`PIC',
`	movslq	36(%r8,%rsi,4), %r10
	add	%r10, %r8
	jmp	*%r8
',`
	jmp	*72(%r8,%rsi,8)
')
	JUMPTABSECT
	ALIGN(8)
L(tab):	JMPENT(	L(0), L(tab))
	JMPENT(	L(1), L(tab))
	JMPENT(	L(2), L(tab))
	JMPENT(	L(3), L(tab))
	JMPENT(	L(4), L(tab))
	JMPENT(	L(5), L(tab))
	JMPENT(	L(6), L(tab))
	JMPENT(	L(7), L(tab))
	JMPENT(	L(8), L(tab))
	TEXT

C Wind-down.  L(k) adds the last k limbs.  The entries form three fall-through
C chains, one per value of k mod 3: each chain adds whole groups of three
C limbs and then its 0, 1 or 2 odd limbs.  The chains meet at L(cj2) and
C L(cj1), where the pending carry is passed on to %rdx and %r9.

C k = 6 and k = 3: whole groups only, the last carry goes to %r9 at L(cj1)
L(6):	add	(ap), %rax
	adc	8(ap), %rcx
	adc	16(ap), %rdx
	adc	$0, %r9
	add	$24, ap
L(3):	add	(ap), %rax
	adc	8(ap), %rcx
	adc	16(ap), %rdx
	jmp	L(cj1)

C k = 7, 4 and 1: whole groups followed by one limb for %rax
L(7):	add	(ap), %rax
	adc	8(ap), %rcx
	adc	16(ap), %rdx
	adc	$0, %r9
	add	$24, ap
L(4):	add	(ap), %rax
	adc	8(ap), %rcx
	adc	16(ap), %rdx
	adc	$0, %r9
	add	$24, ap
L(1):	add	(ap), %rax
	adc	$0, %rcx
	jmp	L(cj2)

C k = 8, 5 and 2: whole groups followed by one limb each for %rax and %rcx
L(8):	add	(ap), %rax
	adc	8(ap), %rcx
	adc	16(ap), %rdx
	adc	$0, %r9
	add	$24, ap
L(5):	add	(ap), %rax
	adc	8(ap), %rcx
	adc	16(ap), %rdx
	adc	$0, %r9
	add	$24, ap
L(2):	add	(ap), %rax
	adc	8(ap), %rcx

L(cj2):	adc	$0, %rdx
L(cj1):	adc	$0, %r9
C Add the carry count in %r9 back into %rax (its weight 2^192 is congruent to
C 1) and ripple the carry through %rcx and %rdx; a carry out of %rdx wraps
C round into %rax for the same reason.  L(0) is also entered straight from
C the table; the first instruction is add, not adc, so the incoming carry flag
C is not consumed.
L(0):	add	%r9, %rax
	adc	$0, %rcx
	adc	$0, %rdx
	adc	$0, %rax

C Final fold.  Each accumulator is split where its weight reaches 2^48 (bit 48
C of %rax, bit 32 of %rcx, bit 16 of %rdx).  The upper part then has a weight
C congruent to 1 and is added as it is; the lower part is shifted left by the
C weight of its accumulator (0, 16 or 32 bits).  All six pieces are summed
C into %rax.
	mov	%rax, %rdi		C 0mod3
	shr	$48, %rax		C 0mod3 high

	and	%r11, %rdi		C 0mod3 low
	mov	R32(%rcx), R32(%r10)	C 1mod3

	shr	$32, %rcx		C 1mod3 high

	add	%rdi, %rax		C apply 0mod3 low
	movzwl	%dx, R32(%rdi)		C 2mod3
	shl	$16, %r10		C 1mod3 low

	add	%rcx, %rax		C apply 1mod3 high
	shr	$16, %rdx		C 2mod3 high

	add	%r10, %rax		C apply 1mod3 low
	shl	$32, %rdi		C 2mod3 low

	add	%rdx, %rax		C apply 2mod3 high
	add	%rdi, %rax		C apply 2mod3 low

	FUNC_EXIT()
	ret
EPILOGUE()