dnl  Intel Pentium mpn_and_n,...,mpn_xnor_n -- bitwise logical operations.

dnl  Copyright 2001, 2002 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or
dnl  modify it under the terms of the GNU Lesser General Public License as
dnl  published by the Free Software Foundation; either version 3 of the
dnl  License, or (at your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful,
dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
dnl  Lesser General Public License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')


C P5: 3.0 c/l  and, ior, xor
C     3.5 c/l  andn, iorn, nand, nior, xnor


C M4_choose_op(name, post, op, pre) only has an effect for the name whose
C OPERATION_name symbol is defined.  For that one name it sets
C
C	M4_function	mpn_name, the function being built
C	M4op		the two-operand instruction applied to the limbs
C	M4_want_pre	"yes" if the text given to M4pre is to be emitted
C	M4_want_post	"yes" if the text given to M4post is to be emitted
C
C In the code further down M4pre wraps the complementing of limbs fetched
C from yp, ahead of M4op, and M4post wraps the complementing of the result
C of M4op.  Together with the table this gives
C
C	and_n	x & y		andn_n	x & ~y		nand_n	~(x & y)
C	ior_n	x | y		iorn_n	x | ~y		nior_n	~(x | y)
C	xor_n	x ^ y					xnor_n	~(x ^ y)

define(`M4_choose_op',
`ifdef(`OPERATION_$1',`
define(`M4_function', `mpn_$1')
define(`M4op',        `$3')
define(`M4_want_pre', `$4')
define(`M4_want_post',`$2')
')')
define(`M4pre', `ifelse(M4_want_pre, yes,`$1')')
define(`M4post',`ifelse(M4_want_post,yes,`$1')')

C            name   post   op   pre
M4_choose_op( and_n,     , andl,    )
M4_choose_op( andn_n,    , andl, yes)
M4_choose_op( nand_n, yes, andl,    )
M4_choose_op( ior_n,     ,  orl,    )
M4_choose_op( iorn_n,    ,  orl, yes)
M4_choose_op( nior_n, yes,  orl,    )
M4_choose_op( xor_n,     , xorl,    )
M4_choose_op( xnor_n, yes, xorl,    )

C Stop if none of the OPERATION symbols above was defined.
ifdef(`M4_function',,
`m4_error(`Unrecognised or undefined OPERATION symbol
')')

MULFUNC_PROLOGUE(mpn_and_n mpn_andn_n mpn_nand_n mpn_ior_n mpn_iorn_n mpn_nior_n mpn_xor_n mpn_xnor_n)

NAILS_SUPPORT(0-31)


C void M4_function (mp_ptr wp, mp_srcptr xp, mp_srcptr yp, mp_size_t size);
C
C Nothing complicated here, just some care to avoid data cache bank clashes
C and AGIs.
C
C A plain "4 loads, 2 ops, 2 stores" per pair of limbs would need one more
C register than there is.  Instead %ebp is juggled a bit, holding first the
C high and then the low xp limb of a pair, and nops are introduced to keep
C the pairings as intended.  An in-place operation would free up a register,
C for a 0.5 c/l speedup, if that is worth bothering with.
C
C This code seems best for P55 too.  Data alignment is a big problem for MMX
C and the pairing restrictions on movq and integer instructions make life
C difficult.
C
C Method: size is halved to give a count of limb pairs.  If size was odd the
C topmost limb is done on its own first.  The pairs are then done from the
C high end of the operands down to limb 0, with the stores for one pair
C sitting at the top of the loop, after the counter has been decremented.
C
C NOTE(review): size is not tested for zero anywhere below, so callers
C presumably must pass size >= 1 -- confirm against the mpn interface.

defframe(PARAM_WP,   4)
defframe(PARAM_XP,   8)
defframe(PARAM_YP,  12)
defframe(PARAM_SIZE,16)

	TEXT
	ALIGN(8)

PROLOGUE(M4_function)
deflit(`FRAME',0)

	C These four registers are restored again at L(end).
	pushl	%ebx	FRAME_pushl()
	pushl	%esi	FRAME_pushl()

	pushl	%edi	FRAME_pushl()
	pushl	%ebp	FRAME_pushl()

	movl	PARAM_SIZE, %ecx
	movl	PARAM_XP, %ebx

	movl	PARAM_YP, %esi
	movl	PARAM_WP, %edi

	shrl	%ecx			C ecx = size/2 pairs, carry = low bit of size
	jnc	L(pair)			C even size, no odd limb to do

	C Odd size: the top limb is at limb index 2*ecx, byte offset 8*ecx.
	movl	(%ebx,%ecx,8), %eax	C risk of data cache bank clash here
	movl	(%esi,%ecx,8), %edx

M4pre(`	notl_or_xorl_GMP_NUMB_MASK(%edx)')

	M4op	%edx, %eax

M4post(`xorl	$GMP_NUMB_MASK, %eax')
	orl	%ecx, %ecx		C ZF set if there are no pairs to do

	movl	%eax, (%edi,%ecx,8)	C movl leaves the flags from orl alone
	jz	L(end)

	jmp	L(pair)


L(loop):
	C eax	high limb of the pair, M4op done, M4post still to do
	C ebx	xp
	C ecx	counter, limb pairs, decrementing
	C edx	low yp limb of the pair, M4pre done
	C esi	yp
	C edi	wp
	C ebp	low xp limb of the pair

	M4op	%ebp, %edx
	nop

M4post(`xorl	$GMP_NUMB_MASK, %eax')
M4post(`xorl	$GMP_NUMB_MASK, %edx')

	movl	%eax, 4(%edi,%ecx,8)
	movl	%edx, (%edi,%ecx,8)

L(pair):
	C Fetch the pair at limb indices 2*ecx-2 (low) and 2*ecx-1 (high).
	movl	-4(%ebx,%ecx,8), %ebp	C high xp limb
	nop

	movl	-4(%esi,%ecx,8), %eax	C high yp limb
	movl	-8(%esi,%ecx,8), %edx	C low yp limb

M4pre(`	xorl	$GMP_NUMB_MASK, %eax')
M4pre(`	xorl	$GMP_NUMB_MASK, %edx')

	M4op	%ebp, %eax
	movl	-8(%ebx,%ecx,8), %ebp	C low xp limb, reusing ebp

	decl	%ecx
	jnz	L(loop)


	C Last pair, same finishing steps as at the top of the loop.  ecx is
	C zero here so the stores go to wp[1] and wp[0].
	M4op	%ebp, %edx
	nop

M4post(`xorl	$GMP_NUMB_MASK, %eax')
M4post(`xorl	$GMP_NUMB_MASK, %edx')

	movl	%eax, 4(%edi,%ecx,8)
	movl	%edx, (%edi,%ecx,8)


L(end):
	popl	%ebp
	popl	%edi

	popl	%esi
	popl	%ebx

	ret

EPILOGUE()