dnl  PowerPC-32/VMX and PowerPC-64/VMX mpn_copyi.

dnl  Copyright 2006 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C                   16-byte coaligned      unaligned
C                      cycles/limb        cycles/limb
C 7400,7410 (G4):          0.5               0.64
C 744x,745x (G4+):         0.75              0.82
C 970 (G5):                0.78              1.02	(64-bit limbs)

C STATUS
C  * Works for all sizes and alignments.

C TODO
C  * Optimize unaligned case.  Some basic tests with 2-way and 4-way unrolling
C    indicate that we can reach 0.56 c/l for 7400, 0.75 c/l for 745x, and 0.80
C    c/l for 970.
C  * Consider using VMX instructions also for head and tail, by using some
C    read-modify-write tricks.
C  * The VMX code is used from the smallest sizes it handles, but measurements
C    show a large speed bump at the cutoff points.  Small copying (perhaps
C    using some read-modify-write technique) should be optimized.
C  * Make an mpn_com based on this code.

define(`GMP_LIMB_BYTES', eval(GMP_LIMB_BITS/8))
define(`LIMBS_PER_VR',  eval(16/GMP_LIMB_BYTES))
define(`LIMBS_PER_2VR', eval(32/GMP_LIMB_BYTES))


ifelse(GMP_LIMB_BITS,32,`
	define(`LIMB32',`	$1')
	define(`LIMB64',`')
',`
	define(`LIMB32',`')
	define(`LIMB64',`	$1')
')

C INPUT PARAMETERS
define(`rp',	`r3')
define(`up',	`r4')
define(`n',	`r5')

define(`us',	`v4')


ASM_START()
PROLOGUE(mpn_copyi)

LIMB32(`cmpi	cr7, n, 11	')
LIMB64(`cmpdi	cr7, n, 5	')
	bge	cr7, L(big)

	or.	r0, n, n
	beqlr	cr0

C Handle small cases with plain operations
	mtctr	n
L(topS):
LIMB32(`lwz	r0, 0(up)	')
LIMB64(`ld	r0, 0(up)	')
	addi	up, up, GMP_LIMB_BYTES
LIMB32(`stw	r0, 0(rp)	')
LIMB64(`std	r0, 0(rp)	')
	addi	rp, rp, GMP_LIMB_BYTES
	bdnz	L(topS)
	blr

C Handle large cases with VMX operations
L(big):
	mfspr	r12, 256
	oris	r0, r12, 0xf800		C Set VRSAVE bit 0-4
	mtspr	256, r0

LIMB32(`rlwinm.	r7, rp, 30,30,31')	C (rp >> 2) mod 4
LIMB64(`rlwinm.	r7, rp, 29,31,31')	C (rp >> 3) mod 2
	beq	L(rp_aligned)

	subfic	r7, r7, LIMBS_PER_VR
	subf	n, r7, n
L(top0):
LIMB32(`lwz	r0, 0(up)	')
LIMB64(`ld	r0, 0(up)	')
	addi	up, up, GMP_LIMB_BYTES
LIMB32(`addic.	r7, r7, -1	')
LIMB32(`stw	r0, 0(rp)	')
LIMB64(`std	r0, 0(rp)	')
	addi	rp, rp, GMP_LIMB_BYTES
LIMB32(`bne	L(top0)	')

L(rp_aligned):

LIMB32(`rlwinm.	r0, up, 30,30,31')	C (up >> 2) mod 4
LIMB64(`rlwinm.	r0, up, 29,31,31')	C (up >> 3) mod 2

LIMB64(`srdi	r7, n, 2	')	C loop count corresponding to n
LIMB32(`srwi	r7, n, 3	')	C loop count corresponding to n
	mtctr	r7			C copy n to count register

	li	r10, 16

	beq	L(up_aligned)

	lvsl	us, 0, up

LIMB32(`andi.	r0, n, 0x4	')
LIMB64(`andi.	r0, n, 0x2	')
	beq	L(1)
	lvx	v0, 0, up
	lvx	v2, r10, up
	vperm	v3, v0, v2, us
	stvx	v3, 0, rp
	addi	up, up, 32
	addi	rp, rp, 16
	b	L(lpu)
L(1):	lvx	v2, 0, up
	addi	up, up, 16
	b	L(lpu)

	ALIGN(32)
L(lpu):	lvx	v0, 0, up
	vperm	v3, v2, v0, us
	stvx	v3, 0, rp
	lvx	v2, r10, up
	addi	up, up, 32
	vperm	v3, v0, v2, us
	stvx	v3, r10, rp
	addi	rp, rp, 32
	bdnz	L(lpu)

	addi	up, up, -16
	b	L(tail)

L(up_aligned):

LIMB32(`andi.	r0, n, 0x4	')
LIMB64(`andi.	r0, n, 0x2	')
	beq	L(lpa)
	lvx	v0, 0, up
	stvx	v0, 0, rp
	addi	up, up, 16
	addi	rp, rp, 16
	b	L(lpa)

	ALIGN(32)
L(lpa):	lvx	v0, 0, up
	lvx	v1, r10, up
	addi	up, up, 32
	nop
	stvx	v0, 0, rp
	stvx	v1, r10, rp
	addi	rp, rp, 32
	bdnz	L(lpa)

L(tail):
LIMB32(`rlwinm.	r7, n, 0,30,31	')	C r7 = n mod 4
LIMB64(`rlwinm.	r7, n, 0,31,31	')	C r7 = n mod 2
	beq	L(ret)
LIMB32(`li	r10, 0	')
L(top2):
LIMB32(`lwzx	r0, r10, up	')
LIMB64(`ld	r0, 0(up)	')
LIMB32(`addic.	r7, r7, -1	')
LIMB32(`stwx	r0, r10, rp	')
LIMB64(`std	r0, 0(rp)	')
LIMB32(`addi	r10, r10, GMP_LIMB_BYTES')
LIMB32(`bne	L(top2)	')

L(ret):	mtspr	256, r12
	blr
EPILOGUE()