dnl  AMD64 mpn_mod_34lsub1 -- remainder modulo 2^48-1.

dnl  Copyright 2000-2002, 2004, 2005, 2007, 2009-2012 Free Software Foundation,
dnl  Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C                cycles/limb
C AMD K8,K9      0.67      0.583 is possible with zero-reg instead of $0, 4-way
C AMD K10        0.67      this seems hard to beat
C AMD bd1        1
C AMD bd2        1
C AMD bd3        ?
C AMD bd4        ?
C AMD zen        0.62
C AMD bobcat     1.07
C AMD jaguar     1
C Intel P4       7.35      terrible, use old code
C Intel core2    1.25      1+epsilon with huge unrolling
C Intel NHM      1.15      this seems hard to beat
C Intel SBR      0.93
C Intel IBR      0.93
C Intel HWL      0.82
C Intel BWL      0.64
C Intel SKY      0.60
C Intel atom     2.5
C Intel SLM      1.59
C VIA nano       1.25      this seems hard to beat

C INPUT PARAMETERS
C ap is the source limb pointer and n the limb count, matching the prototype
C below.
define(`ap',	%rdi)
define(`n',	%rsi)

C mp_limb_t mpn_mod_34lsub1 (mp_srcptr ap, mp_size_t n)

C TODO
C  * Review feed-in and wind-down code.
C ---------------------------------------------------------------------------
C mpn_mod_34lsub1 -- fold the n limbs at ap into one limb that is congruent
C to the operand modulo 2^48-1.
C
C The result is congruent but not fully reduced: the n = 1 path returns the
C limb unchanged, and the other paths return a plain sum of partial terms.
C
C Method: modulo 2^48-1 we have 2^64 = 2^16, 2^128 = 2^32 and 2^192 = 1, so
C limbs are summed into three accumulators selected by limb index mod 3 and
C the three accumulators are folded together at the end.
C
C Precondition: n >= 1 is assumed, since the first limb is loaded before n
C is examined.
C
C Register use:
C   %rax       accumulator for limbs 0 mod 3, and the return value
C   %rcx       accumulator for limbs 1 mod 3
C   %rdx       accumulator for limbs 2 mod 3
C   %r9        count of carries out of %rdx, weight 2^192 = 1
C   %r11       mask 2^48-1
C   %r8, %r10  jump table base and entry, %r10 is also scratch in the fold
C   %rsi       limb count, reused for src[1] in the n = 2 path
C   %rdi       ap, reused as scratch in the final fold
C
C NOTE(review): FUNC_ENTRY, FUNC_EXIT, JMPENT, JUMPTABSECT and R32 come from
C config.m4 and are not visible here.  FUNC_ENTRY and FUNC_EXIT presumably
C adapt the DOS64 calling convention -- confirm against config.m4.
C ---------------------------------------------------------------------------

ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

ASM_START()
	TEXT
	ALIGN(32)
PROLOGUE(mpn_mod_34lsub1)
	FUNC_ENTRY(2)

	mov	$0x0000FFFFFFFFFFFF, %r11	C mask 2^48-1

	mov	(ap), %rax			C src[0], loaded for every n

	cmp	$2, %rsi
	ja	L(gt2)				C n > 2: general path

	jb	L(one)				C n < 2: return src[0] as is

C n = 2: fold src[0] and src[1] directly.
	mov	8(ap), %rsi
	mov	%rax, %rdx
	shr	$48, %rax		C src[0] high

	and	%r11, %rdx		C src[0] low
	add	%rdx, %rax
	mov	R32(%rsi), R32(%rdx)

	shr	$32, %rsi		C src[1] high
	add	%rsi, %rax

	shl	$16, %rdx		C src[1] low
	add	%rdx, %rax
L(one):	FUNC_EXIT()
	ret


C Don't change this, the wind-down code is not able to handle greater values
define(UNROLL,3)

C n > 2: seed the three accumulators with the first three limbs, then add
C UNROLL*3 limbs per pass of the loop.
L(gt2):	mov	8(ap), %rcx
	mov	16(ap), %rdx
	xor	%r9, %r9
	add	$24, ap
	sub	$eval(UNROLL*3+3), %rsi
	jc	L(end)			C fewer than UNROLL*3 limbs remain
	ALIGN(16)
L(top):
	add	(ap), %rax
	adc	8(ap), %rcx
	adc	16(ap), %rdx
	adc	$0, %r9
forloop(i,1,UNROLL-1,`dnl
	add	eval(i*24)(ap), %rax
	adc	eval(i*24+8)(ap), %rcx
	adc	eval(i*24+16)(ap), %rdx
	adc	$0, %r9
')dnl
	add	$eval(UNROLL*24), ap
	sub	$eval(UNROLL*3), %rsi
	jnc	L(top)

C Wind-down.  Here %rsi is in the range -9 to -1 and %rsi+9 is the number of
C limbs still to be added, so the table is indexed with a bias of 9 entries:
C 36 = 9*4 for the 4-byte entries read by movslq and added to the table base,
C 72 = 9*8 for the 8-byte entries jumped through directly.
L(end):
	lea	L(tab)(%rip), %r8
ifdef(`PIC',
`	movslq	36(%r8,%rsi,4), %r10
	add	%r10, %r8
	jmp	*%r8
',`
	jmp	*72(%r8,%rsi,8)
')
	JUMPTABSECT
	ALIGN(8)
L(tab):	JMPENT(	L(0), L(tab))
	JMPENT(	L(1), L(tab))
	JMPENT(	L(2), L(tab))
	JMPENT(	L(3), L(tab))
	JMPENT(	L(4), L(tab))
	JMPENT(	L(5), L(tab))
	JMPENT(	L(6), L(tab))
	JMPENT(	L(7), L(tab))
	JMPENT(	L(8), L(tab))
	TEXT

C Each label below is entered with that many limbs left to add.  The chains
C fall through three limbs at a time and meet at cj2, cj1 and 0, which absorb
C the carry still pending in the flags.
L(6):	add	(ap), %rax
	adc	8(ap), %rcx
	adc	16(ap), %rdx
	adc	$0, %r9
	add	$24, ap
L(3):	add	(ap), %rax
	adc	8(ap), %rcx
	adc	16(ap), %rdx
	jmp	L(cj1)

L(7):	add	(ap), %rax
	adc	8(ap), %rcx
	adc	16(ap), %rdx
	adc	$0, %r9
	add	$24, ap
L(4):	add	(ap), %rax
	adc	8(ap), %rcx
	adc	16(ap), %rdx
	adc	$0, %r9
	add	$24, ap
L(1):	add	(ap), %rax
	adc	$0, %rcx
	jmp	L(cj2)

L(8):	add	(ap), %rax
	adc	8(ap), %rcx
	adc	16(ap), %rdx
	adc	$0, %r9
	add	$24, ap
L(5):	add	(ap), %rax
	adc	8(ap), %rcx
	adc	16(ap), %rdx
	adc	$0, %r9
	add	$24, ap
L(2):	add	(ap), %rax
	adc	8(ap), %rcx

L(cj2):	adc	$0, %rdx
L(cj1):	adc	$0, %r9
C Carries counted in %r9 have weight 2^192 = 1, so they go back into %rax.
C The last adc wraps the carry out of %rdx around in the same way.
L(0):	add	%r9, %rax
	adc	$0, %rcx
	adc	$0, %rdx
	adc	$0, %rax

C Final fold.  Each accumulator is split at the bit where its weight crosses
C a multiple of 48, and the pieces are summed into %rax.
	mov	%rax, %rdi		C 0mod3
	shr	$48, %rax		C 0mod3 high

	and	%r11, %rdi		C 0mod3 low
	mov	R32(%rcx), R32(%r10)	C 1mod3

	shr	$32, %rcx		C 1mod3 high

	add	%rdi, %rax		C apply 0mod3 low
	movzwl	%dx, R32(%rdi)		C 2mod3
	shl	$16, %r10		C 1mod3 low

	add	%rcx, %rax		C apply 1mod3 high
	shr	$16, %rdx		C 2mod3 high

	add	%r10, %rax		C apply 1mod3 low
	shl	$32, %rdi		C 2mod3 low

	add	%rdx, %rax		C apply 2mod3 high
	add	%rdi, %rax		C apply 2mod3 low

	FUNC_EXIT()
	ret
EPILOGUE()