dnl  AMD64 mpn_mod_34lsub1 -- remainder modulo 2^48-1.

dnl  Copyright 2000, 2001, 2002, 2004, 2005, 2007, 2010, 2011, 2012 Free
dnl  Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C	     cycles/limb
C AMD K8,K9	 1.0
C AMD K10	 1.12
C Intel P4	 3.25
C Intel core2	 1.5
C Intel corei	 1.5
C Intel atom	 2.5
C VIA nano	 1.75


C INPUT PARAMETERS
C ap: source limb pointer (first argument)
C n:  limb count (second argument)
define(`ap', %rdi)
define(`n', %rsi)

C mp_limb_t mpn_mod_34lsub1 (mp_srcptr up, mp_size_t n)

C TODO
C  * Review feed-in and wind-down code.  In particular, try to avoid adc and
C    sbb to placate Pentium4.
C  * It seems possible to reach 2.67 c/l by using a cleaner 6-way unrolling,
C    without the dual loop exits.
ABI_SUPPORT(DOS64)
ABI_SUPPORT(STD64)

C  Strategy: with m = 2^48-1 we have 2^48 == 1 (mod m), so 64-bit limb i
C  contributes its bits at position (64*i mod 48), which cycles through
C  0, 16, 32 with period 3.  Limbs are summed by index mod 3 into three
C  64-bit accumulators, each paired with a carry counter, and the pieces
C  are finally folded into one value congruent to {ap,n} mod 2^48-1.
C  NOTE(review): the result is congruent, not necessarily fully reduced
C  below 2^48-1 -- no final reduction is performed here.
C
C  Register usage on the n > 2 path:
C    rax, rcx, rdx  accumulators for limbs with index 0, 1, 2 (mod 3)
C    r10, r8,  r9   carry counters for rax, rcx, rdx respectively
C    r11            0x0000FFFFFFFFFFFF, the low-48-bit mask
C    rsi            remaining-limb counter, starts at n - 2

ASM_START()
	TEXT
	ALIGN(32)
PROLOGUE(mpn_mod_34lsub1)
	FUNC_ENTRY(2)

	mov	$0x0000FFFFFFFFFFFF, %r11	C low-48-bit mask, live throughout

	sub	$2, %rsi		C size dispatch: flags from n - 2
	ja	L(gt2)			C n > 2: unrolled accumulation loop

	mov	(ap), %rax		C rax = src[0]
	nop				C padding (presumably decode/align tuning -- TODO confirm)
	jb	L(1)			C n == 1: return src[0] as-is (already < 2^64)

	C n == 2: fold both limbs.  src[1] sits at global bits 64..127, so its
	C low 32 bits land at position 16 and its high 32 bits wrap to position 0.
	mov	8(ap), %rsi		C rsi = src[1]
	mov	%rax, %rdx
	shr	$48, %rax		C src[0] high 16 bits, wrap to position 0

	and	%r11, %rdx		C src[0] low 48 bits
	add	%rdx, %rax
	mov	R32(%rsi), R32(%rdx)	C src[1] low 32 bits

	shr	$32, %rsi		C src[1] high 32 bits, wrap to position 0
	add	%rsi, %rax

	shl	$16, %rdx		C src[1] low bits go at position 16
	add	%rdx, %rax

L(1):	FUNC_EXIT()
	ret


	ALIGN(16)
L(gt2):	xor	R32(%rax), R32(%rax)	C accumulator, limbs == 0 mod 3 (position 0)
	xor	R32(%rcx), R32(%rcx)	C accumulator, limbs == 1 mod 3 (position 16)
	xor	R32(%rdx), R32(%rdx)	C accumulator, limbs == 2 mod 3 (position 32)
	xor	%r8, %r8		C carry counter for rcx
	xor	%r9, %r9		C carry counter for rdx
	xor	%r10, %r10		C carry counter for rax

C  Main loop, 6 limbs per full iteration with two exits so the tail at
C  24(ap) is always 0, 1 or 2 unprocessed limbs when L(end) is reached.
L(top):	add	(ap), %rax
	adc	$0, %r10		C count carries out of rax
	add	8(ap), %rcx
	adc	$0, %r8			C count carries out of rcx
	add	16(ap), %rdx
	adc	$0, %r9			C count carries out of rdx

	sub	$3, %rsi
	jng	L(end)			C <= 3 limbs were left: tail is at 24(ap)

	add	24(ap), %rax
	adc	$0, %r10
	add	32(ap), %rcx
	adc	$0, %r8
	add	40(ap), %rdx
	lea	48(ap), ap		C advance 6 limbs; lea keeps flags intact
	adc	$0, %r9

	sub	$3, %rsi
	jg	L(top)


	add	$-24, ap		C undo over-advance so tail is again at 24(ap)
L(end):
	C  A carry out of an accumulator has weight 2^64 times that
	C  accumulator, i.e. it belongs one position (16 bits) further on,
	C  which is the NEXT accumulator in the 0 -> 16 -> 32 -> 0 cycle:
	C  rdx carries (r9) -> rax, rax carries (r10) -> rcx, rcx carries (r8) -> rdx.
	add	%r9, %rax
	adc	%r10, %rcx
	adc	%r8, %rdx		C final adc carry stays pending below

	C  Wind-down: rsi is in {-2,-1,0}, i.e. {0,1,2} tail limbs remain.
	C  r10 is loaded with the positional weight of the LAST pending carry:
	C  out of rdx -> 2^96 == 2^0, out of rax -> 2^64 == 2^16,
	C  out of rcx -> 2^80 == 2^32.  inc/dec/mov/js preserve CF, so the
	C  carry from the adc chain survives into the tail adcs and the sbb.
	inc	%rsi
	mov	$0x1, R32(%r10)
	js	L(combine)		C no tail limbs: pending carry is rdx carry

	mov	$0x10000, R32(%r10)
	adc	24(ap), %rax		C tail limb 0 (plus the pending rdx carry)
	dec	%rsi
	js	L(combine)		C one tail limb: pending carry is rax carry

	adc	32(ap), %rcx		C tail limb 1
	mov	$0x100000000, %r10	C pending carry is now rcx carry

L(combine):
	sbb	%rsi, %rsi		C carry
	mov	%rax, %rdi		C 0mod3
	shr	$48, %rax		C 0mod3 high

	and	%r10, %rsi		C carry masked to its positional weight
	and	%r11, %rdi		C 0mod3 low
	mov	R32(%rcx), R32(%r10)	C 1mod3

	add	%rsi, %rax		C apply carry (sbb gave -carry; wraps out mod 2^64 -- TODO confirm intent)
	shr	$32, %rcx		C 1mod3 high

	add	%rdi, %rax		C apply 0mod3 low
	movzwl	%dx, R32(%rdi)		C 2mod3
	shl	$16, %r10		C 1mod3 low

	add	%rcx, %rax		C apply 1mod3 high
	shr	$16, %rdx		C 2mod3 high

	add	%r10, %rax		C apply 1mod3 low
	shl	$32, %rdi		C 2mod3 low

	add	%rdx, %rax		C apply 2mod3 high
	add	%rdi, %rax		C apply 2mod3 low

	FUNC_EXIT()
	ret
EPILOGUE()