dnl  Intel Pentium mpn_and_n,...,mpn_xnor_n -- bitwise logical operations.

dnl  Copyright 2001, 2002 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C Speed, in cycles per limb:
C
C P5: 3.0 c/l  and, ior, xor
C     3.5 c/l  andn, iorn, nand, nior, xnor


C Usage: M4_choose_op(name, want_post, insn, want_pre)
C
C If OPERATION_<name> is defined, configure this file for that variant:
C
C     M4_function    mpn_<name>, the symbol being built
C     M4_want_post   "yes" if text wrapped in M4post is to be emitted
C     M4op           the two-operand logical instruction
C     M4_want_pre    "yes" if text wrapped in M4pre is to be emitted
C
C The code below wraps its "xor with GMP_NUMB_MASK" steps in M4pre (applied
C to the yp operand before M4op) and M4post (applied to the result after
C M4op), so these two flags select the complemented variants.

define(`M4_choose_op',
`ifdef(`OPERATION_$1',`
define(`M4_function', `mpn_$1')
define(`M4_want_post',`$2')
define(`M4op',        `$3')
define(`M4_want_pre', `$4')
')')

C M4pre and M4post expand to their argument when the matching flag is "yes",
C and to nothing otherwise.

define(`M4pre', `ifelse(M4_want_pre, yes,`$1')')
define(`M4post',`ifelse(M4_want_post,yes,`$1')')

C             name    post   insn  pre
M4_choose_op( and_n,     , andl,    )
M4_choose_op( andn_n,    , andl, yes)
M4_choose_op( nand_n, yes, andl,    )
M4_choose_op( ior_n,     ,  orl,    )
M4_choose_op( iorn_n,    ,  orl, yes)
M4_choose_op( nior_n, yes,  orl,    )
M4_choose_op( xor_n,     , xorl,    )
M4_choose_op( xnor_n, yes, xorl,    )

C Stop if none of the OPERATION_ symbols above was defined.

ifdef(`M4_function',,
`m4_error(`Unrecognised or undefined OPERATION symbol
')')

MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n)

NAILS_SUPPORT(0-31)


C void M4_function (mp_ptr wp, mp_srcptr xp, mp_srcptr yp, mp_size_t size);
C
C The routine itself is simple; the only care taken is to avoid data cache
C bank clashes and AGIs.
C
C A plain "4 loads, 2 ops, 2 stores" loop would need one more register than
C is available.  Instead %ebp is juggled a bit, and nops are inserted so the
C instructions keep pairing as intended.  An in-place operation would free
C up a register, for an 0.5 c/l speedup, if that's worth bothering with.
C
C This code seems best for P55 too.  Data alignment is a big problem for MMX
C and the pairing restrictions on movq and integer instructions make life
C difficult.

C Locations of the four arguments.  The offsets are presumably relative to
C the stack pointer at entry, FRAME being 0 there -- see defframe.

defframe(PARAM_SIZE,16)
defframe(PARAM_YP,  12)
defframe(PARAM_XP,   8)
defframe(PARAM_WP,   4)

	TEXT
	ALIGN(8)

PROLOGUE(M4_function)
deflit(`FRAME',0)

	C Save the four registers used below.  Each push is followed by
	C FRAME_pushl, presumably so the PARAM_ references stay valid.

	pushl	%ebx	FRAME_pushl()
	pushl	%esi	FRAME_pushl()

	pushl	%edi	FRAME_pushl()
	pushl	%ebp	FRAME_pushl()

	movl	PARAM_SIZE, %ecx
	movl	PARAM_XP, %ebx

	movl	PARAM_YP, %esi
	movl	PARAM_WP, %edi

	C ecx = size/2 = number of limb pairs, carry = low bit of size.
	C An even size goes straight to the paired loop.

	shrl	%ecx
	jnc	L(entry)

	C Odd size: process the single limb at index 2*ecx, which is the
	C topmost one, before the pairs.

	movl	(%ebx,%ecx,8), %eax	C risk of data cache bank clash here
	movl	(%esi,%ecx,8), %edx

M4pre(`	notl_or_xorl_GMP_NUMB_MASK(%edx)')

	M4op	%edx, %eax

M4post(`xorl	$GMP_NUMB_MASK, %eax')
	orl	%ecx, %ecx		C set ZF if no pairs remain

	movl	%eax, (%edi,%ecx,8)
	jz	L(done)

	jmp	L(entry)


	C The loop handles two limbs per pass, working from high addresses
	C down to low.  The second M4op and both stores for a pair are done
	C at L(top), after ecx has been decremented, or in the copy of that
	C code following the loop for the last pair.

L(top):
	C eax	high limb of the pair: yp limb, then result
	C ebx	xp
	C ecx	counter, limb pairs, decrementing
	C edx	low limb of the pair: yp limb, then result
	C esi	yp
	C edi	wp
	C ebp	xp limb, scratch

	M4op	%ebp, %edx
	nop				C keeps the pairing as intended

M4post(`xorl	$GMP_NUMB_MASK, %eax')
M4post(`xorl	$GMP_NUMB_MASK, %edx')

	movl	%eax, 4(%edi,%ecx,8)
	movl	%edx, (%edi,%ecx,8)

L(entry):
	movl	-4(%ebx,%ecx,8), %ebp	C xp high limb of the next pair
	nop				C keeps the pairing as intended

	movl	-4(%esi,%ecx,8), %eax	C yp high limb
	movl	-8(%esi,%ecx,8), %edx	C yp low limb

M4pre(`	xorl	$GMP_NUMB_MASK, %eax')
M4pre(`	xorl	$GMP_NUMB_MASK, %edx')

	M4op	%ebp, %eax
	movl	-8(%ebx,%ecx,8), %ebp	C xp low limb, used by the next M4op

	decl	%ecx
	jnz	L(top)


	C Finish the last pair; ecx is zero here, so the stores go to the
	C first two limbs of wp.

	M4op	%ebp, %edx
	nop

M4post(`xorl	$GMP_NUMB_MASK, %eax')
M4post(`xorl	$GMP_NUMB_MASK, %edx')

	movl	%eax, 4(%edi,%ecx,8)
	movl	%edx, (%edi,%ecx,8)


L(done):
	C Restore the saved registers in reverse order of the pushes.

	popl	%ebp
	popl	%edi

	popl	%esi
	popl	%ebx

	ret

EPILOGUE()