dnl  Alpha mpn_com -- mpn one's complement.

dnl  Copyright 2003 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')


C      cycles/limb
C EV4:    4.75
C EV5:    2.0
C EV6:    1.5


C void mpn_com (mp_ptr dst, mp_srcptr src, mp_size_t size);
C
C For ev5 the main loop is 7 cycles plus 1 taken branch bubble, for a total
C 2.0 c/l.  In general, a pattern like this unrolled to N limbs per loop
C will be 1.5+2/N c/l.
C
C 2 cycles of loop control are unavoidable, for pointer updates and the
C taken branch bubble, but also since ldq cannot issue two cycles after stq
C (and with a run of stqs that means neither of two cycles at the end of the
C loop).
C
C The fbeq is forced into the second cycle of the loop using unops, since
C the first time through it must wait for the cvtqt result.  Once that
C result is ready (a 1 cycle stall) then both the branch and following loads
C can issue together.
C
C The main loop handles an odd count of limbs, being two limbs loaded before
C each size test, plus one pipelined around from the previous iteration (or
C setup in the entry sequence).
C
C An even number of limbs is handled by an explicit dst[0]=~src[0] in the
C entry sequence, and an increment of the pointers.  For an odd size there's
C no increment and the first store in the loop (r24) is a repeat of dst[0].
C
C Note that the load for r24 after the possible pointer increment is done
C before the explicit store to dst[0], in case src==dst.


ASM_START()

C The constant 2.0, loaded into f1 below and subtracted from the loop
C counter in f0 once per iteration.
FLOAT64(L(dat), 2.0)

	ALIGN(16)

PROLOGUE(mpn_com,gp)

	C r16	dst
	C r17	src
	C r18	size

	C Entry sequence.  It computes the loop count (size-3)/2, stores
	C ~src[0] to dst[0], and preloads the first pipelined result into
	C r24.  The count is moved from r6 to f0 through an 8-byte stack
	C slot (stq then ldt) and converted with cvtqt, because the loop
	C tests and decrements it with fbeq, subt and fbge.
	C
	C NOTE(review): src[0] is read and dst[0] is written
	C unconditionally, so size >= 1 is presumably required -- confirm
	C against the callers.

	lda	r30, -16(r30)		C temporary stack space
	lda	r7, -3(r18)		C size - 3

	ldq	r20, 0(r17)		C src[0]
	srl	r7, 1, r6		C (size-3)/2

	stq	r6, 8(r30)		C (size-3)/2
	and	r7, 1, r5		C 1 if size even

	LEA(	r8, L(dat))
	s8addq	r5, r17, r17		C skip src[0] if even

	ornot	r31, r20, r20		C ~src[0]
	unop

	ldt	f0, 8(r30)		C (size-3)/2
	ldq	r24, 0(r17)		C src[0 or 1]

	stq	r20, 0(r16)		C dst[0]
	s8addq	r5, r16, r19		C skip dst[0] if even

	ldt	f1, 0(r8)		C data 2.0
	lda	r30, 16(r30)		C restore stack
	unop
	cvtqt	f0, f0			C (size-3)/2 as float

	C When size<=2 the branch below goes straight to done_1, which
	C stores only r24 and does not read f0.
	ornot	r31, r24, r24		C ~src[0 or 1], first pipelined result
	blt	r7, L(done_1)		C if size<=2
	unop
	unop


	C 16-byte alignment here
L(top):
	C r17	src, incrementing
	C r19	dst, incrementing
	C r24	dst[i] result, ready to store
	C f0	(size-3)/2, decrementing
	C f1	2.0

	C The first two loads are issued before the count test so that
	C done_2 finds src[i+1] and src[i+2] already in r20 and r21.
	ldq	r20, 8(r17)		C src[i+1]
	ldq	r21, 16(r17)		C src[i+2]
	unop
	unop

	fbeq	f0, L(done_2)		C count == 0, three limbs remain
	unop
	ldq	r22, 24(r17)		C src[i+3]
	ldq	r23, 32(r17)		C src[i+4]

	stq	r24, 0(r19)		C dst[i]
	ornot	r31, r20, r20		C ~src[i+1]
	subt	f0, f1, f0		C count -= 2
	unop

	stq	r20, 8(r19)		C dst[i+1]
	ornot	r31, r21, r21		C ~src[i+2]
	unop
	unop

	stq	r21, 16(r19)		C dst[i+2]
	ornot	r31, r22, r22		C ~src[i+3]

	stq	r22, 24(r19)		C dst[i+3]
	ornot	r31, r23, r24		C ~src[i+4], stored by the next pass or by done_1

	lda	r17, 32(r17)		C src += 4
	lda	r19, 32(r19)		C dst += 4
	unop
	fbge	f0, L(top)		C loop while count >= 0, else fall into done_1


L(done_1):
	C r19	&dst[size-1]
	C r24	result for dst[size-1]

	stq	r24, 0(r19)		C dst[size-1]
	ret	r31, (r26), 1		C return through the address in r26


L(done_2):
	C r19	&dst[size-3]
	C r20	src[size-2]
	C r21	src[size-1]
	C r24	result for dst[size-3]

	stq	r24, 0(r19)		C dst[size-3]
	ornot	r31, r20, r20		C ~src[size-2]

	stq	r20, 8(r19)		C dst[size-2]
	ornot	r31, r21, r21		C ~src[size-1]

	stq	r21, 16(r19)		C dst[size-1]
	ret	r31, (r26), 1		C return through the address in r26

EPILOGUE()
ASM_END()