dnl  AMD Athlon mpn_com -- mpn bitwise one's complement.

dnl  Copyright 2002 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C K7: 1.0 cycles/limb


C void mpn_com (mp_ptr dst, mp_srcptr src, mp_size_t size);
C
C  The loop form below is necessary for the claimed speed.  It needs to be
C  aligned to a 16 byte boundary and only 16 bytes long.  Maybe that's so it
C  fits in a BTB entry.  The adjustments to %eax and %edx avoid offsets on
C  the movq's and achieve the necessary size.
C
C  If both src and dst are 4mod8, the loop runs at 1.5 c/l.  So long as one
C  of the two is 0mod8, it runs at 1.0 c/l.  On that basis dst is checked
C  (offset by the size, as per the loop addressing) and one high limb
C  processed separately to get alignment.
C
C  The padding for the nails case is unattractive, but shouldn't cost any
C  cycles.
C  The nails-case padding further down is written as explicit .byte's so
C  that exactly the intended instructions are emitted; it sits at a point
C  where we're probably stalled waiting for loads anyway.
C
C  Possible enhancement:
C
C  The load/pxor/store combination might be unrollable to approach 0.5 c/l
C  if that were wanted.

defframe(PARAM_SIZE,12)
defframe(PARAM_SRC, 8)
defframe(PARAM_DST, 4)

	TEXT
	ALIGN(16)

PROLOGUE(mpn_com)
deflit(`FRAME',0)

	movl	PARAM_DST, %edx
	movl	PARAM_SIZE, %ecx
	pcmpeqd	%mm7, %mm7		C mm7 = all ones

	C ZF is set here when dst+4*size is 0mod8.  Nothing between this
	C test and the jz further down modifies the flags.
	leal	(%edx,%ecx,4), %eax
	andl	$4, %eax
ifelse(GMP_NAIL_BITS,0,,
`	psrld	$GMP_NAIL_BITS, %mm7')		C GMP_NUMB_MASK in each half

	movl	PARAM_SRC, %eax
	movd	-4(%eax,%ecx,4), %mm0		C src[size-1], the high limb

ifelse(GMP_NAIL_BITS,0,,
`	C two 6-byte no-op leas, padding for the loop alignment below
	.byte	0x8d, 0xb6, 0x00, 0x00, 0x00, 0x00	C lea 0(%esi),%esi
	.byte	0x8d, 0xbf, 0x00, 0x00, 0x00, 0x00	C lea 0(%edi),%edi
')

	jz	L(dst_ok)

	C dst+4*size is 4mod8: complement the high limb separately so the
	C limbs that remain end on an 8 byte boundary in dst
	pxor	%mm7, %mm0
	movd	%mm0, -4(%edx,%ecx,4)		C dst[size-1]
	decl	%ecx
	jz	L(fin)				C size was 1
L(dst_ok):

	C Bias both pointers by one limb and the count by one, so that the
	C loop can address with plain (base,index,4) and no displacement.
	addl	$4, %eax
	addl	$4, %edx
	decl	%ecx
	jz	L(last)				C a single limb remains

	C offset 0x30 for no nails, or 0x40 for nails
	ALIGN(16)
L(pairs):
	C eax	src+4
	C ecx	counter, limbs still to do less 1
	C edx	dst+4
	C mm7	complement mask

	subl	$2, %ecx
	movq	(%eax,%ecx,4), %mm0
	pxor	%mm7, %mm0
	movq	%mm0, (%edx,%ecx,4)
	jg	L(pairs)			C flags still from the subl

	jnz	L(fin)				C ecx = -1, no limb left over

L(last):
	movd	-4(%eax), %mm0			C src[0], the low limb
	pxor	%mm7, %mm0
	movd	%mm0, -4(%edx)			C dst[0]

L(fin):
	emms

	ret

EPILOGUE()