dnl  Power9 mpn_addmul_2.

dnl  Contributed to the GNU project by Torbjörn Granlund.

dnl  Copyright 2018 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C mpn_addmul_2(rp, up, n, vp)
C
C Multiply the n-limb number {up,n} by the 2-limb number {vp,2}, add the
C product into the limbs at rp, and return the most significant (carry)
C limb in r3.  (Standard GMP mpn_addmul_2 contract -- TODO confirm exact
C rp extent against gmp-impl.h; this code reads and writes rp limbs only.)
C
C Two independent carry chains are maintained throughout:
C   * the CA bit, consumed/produced by adde/addze, for the v0 products;
C   * the OV bit, consumed/produced by addex(..., 0),  for the v1 products.
C addex with operand-3 = 0 is the Power ISA 3.0 "add extended using OV"
C form, which lets the two chains run without serializing on one carry bit.
C The maddld/maddhdu multiply-add instructions (also ISA 3.0 / Power9) fuse
C the addition of the old rp limb into the product's low/high halves.

C		cycles/limb
C power9:	     1.62

C STATUS
C  * Not written with any power9 pipeline understanding.
C  * The 4x unrolling was not motivated by any timing tests.
C  * No local scheduling for performance tweaking has been done.
C  * Decrease load scheduling!

C Parameter registers (ELFv2 argument order r3..r6).
define(`rp', `r3')
define(`up', `r4')
define(`n',  `r5')	C Note: Reused as scratch
define(`vp', `r6')	C Note: Reused for v1

C Multiplier limbs, loaded once up front; v1 overlays vp after the loads.
define(`v0', `r7')
define(`v1', `r6')


ASM_START()
PROLOGUE(mpn_addmul_2)
C Spill callee-saved r26-r31 below the stack pointer without adjusting r1
C (presumably relying on the ELFv2 red zone / protected area -- NOTE(review):
C confirm this is within the ABI-permitted region).
	std	r26, -48(r1)
	std	r27, -40(r1)
	std	r28, -32(r1)
	std	r29, -24(r1)
	std	r30, -16(r1)
	std	r31, -8(r1)

C Prime both carry chains to zero before entering the adde/addex chains.
	subfic	r0, r1, 0	C clear CA
	subfo	r0, r0, r0	C clear OV and r0

	cmpdi	cr7, n, 4	C cr7: is n < 4? (selects short-operand paths)

	ld	v0, 0(vp)	C low multiplier limb
	ld	v1, 8(vp)	C high multiplier limb (vp reg now free -> v1)

	srdi	r10, n, 2	C main loop runs floor(n/4) iterations
	mtctr	r10

	rldicl.	r9, n, 0, 63	C isolate bit 0 of n (n odd?)
	bne	cr0, L(bx1)	C odd n -> bx1 feed-in

C ---- feed-in for even n: first two limbs handled here, branch on n mod 4 ----
L(bx0):	rldicl. r9, n, 63, 63	C isolate bit 1 of n (n mod 4 == 2?)

	ld	r28, 0(rp)
	ld	r8, 0(up)
	ld	r11, 8(rp)
	ld	r9, 8(up)
	maddld(	r26, r8, v0, r28)	C lo(u0*v0) + rp[0]
	maddhdu(r31, r8, v0, r28)	C hi(u0*v0) + carry-out of the above
	blt	cr7, L(2)		C n < 4, i.e. n == 2: tiny wind-down
	ld	r28, 16(rp)
	mulld	r5, r8, v1		C lo(u0*v1); n's register reused here
	mulhdu	r10, r8, v1		C hi(u0*v1)
	bne	cr0, L(b10)

L(b00):	addi	up, up, -8	C n mod 4 == 0: bias pointers so the loop's
	addi	rp, rp, -24	C fixed offsets line up, enter at lo0
	b	L(lo0)

L(b10):	addi	up, up, 8	C n mod 4 == 2: enter loop at lo2
	addi	rp, rp, -8
	b	L(lo2)

L(2):	addi	rp, rp, -8	C n == 2: go straight to the wind-down
	mulld	r5, r8, v1
	mulhdu	r10, r8, v1
	b	L(cj2)

C ---- feed-in for odd n: first limb handled here, branch on n mod 4 ----
L(bx1):	rldicl. r9, n, 63, 63	C isolate bit 1 of n (n mod 4 == 3?)

	ld	r29, 0(rp)
	ld	r9, 0(up)
	ld	r10, 8(rp)
	ld	r8, 8(up)
	maddld(	r27, r9, v0, r29)	C lo(u0*v0) + rp[0]
	maddhdu(r30, r9, v0, r29)	C hi(u0*v0) + carry-out
	ld	r29, 16(rp)
	mulld	r12, r9, v1		C lo(u0*v1)
	mulhdu	r11, r9, v1		C hi(u0*v1)
	bne	cr0, L(b11)

L(b01):	addi	rp, rp, -16	C n mod 4 == 1: enter loop at lo1
	b	L(lo1)
L(b11):	addi	up, up, 16	C n mod 4 == 3: fall into loop top
	blt	cr7, L(end)	C n == 3: skip the loop entirely

C ---- main loop: 4 limbs per iteration, software-pipelined two deep ----
C The numeric comments (e.g. "C 0 4") look like issue-slot/cycle scheduling
C annotations carried over by the author -- preserved verbatim.
L(top):	ld	r9, 0(up)
	maddld(	r26, r8, v0, r10)	C 0 4 -> adde
	maddhdu(r31, r8, v0, r10)	C 1 5
	adde	r0, r27, r0		C 7 11		CA chain (v0 products)
	ld	r28, 24(rp)
	std	r0, 0(rp)
	maddld(	r5, r8, v1, r29)	C 1 5 -> addex
	maddhdu(r10, r8, v1, r29)	C 2 6
	addex(	r0, r12, r30, 0)	C 8 12		OV chain (v1 products)
L(lo2):	ld	r8, 8(up)
	maddld(	r27, r9, v0, r11)	C 1 5 -> adde
	maddhdu(r30, r9, v0, r11)	C 2 6
	adde	r0, r26, r0		C 8 12
	ld	r29, 32(rp)
	std	r0, 8(rp)
	maddld(	r12, r9, v1, r28)	C 2 6 -> addex
	maddhdu(r11, r9, v1, r28)	C 3 7
	addex(	r0, r5, r31, 0)		C 5 9 13
L(lo1):	ld	r9, 16(up)
	maddld(	r26, r8, v0, r10)	C 2 6 -> adde
	maddhdu(r31, r8, v0, r10)	C 3 7
	adde	r0, r27, r0		C 5 9 13
	ld	r28, 40(rp)
	std	r0, 16(rp)
	maddld(	r5, r8, v1, r29)	C 3 7 -> addex
	maddhdu(r10, r8, v1, r29)	C 4 8
	addex(	r0, r12, r30, 0)	C 6 10
L(lo0):	ld	r8, 24(up)
	maddld(	r27, r9, v0, r11)	C 3 7 -> adde
	maddhdu(r30, r9, v0, r11)	C 4 8
	adde	r0, r26, r0		C 6 10
	ld	r29, 48(rp)
	std	r0, 24(rp)
	maddld(	r12, r9, v1, r28)	C 4 8 -> addex
	maddhdu(r11, r9, v1, r28)	C 5 9
	addex(	r0, r5, r31, 0)		C 7 11
	addi	up, up, 32
	addi	rp, rp, 32
	bdnz	L(top)

C ---- wind-down: drain the two pipelined limbs and merge the carry chains ----
L(end):	ld	r9, 0(up)
	maddld(	r26, r8, v0, r10)	C 0 4
	maddhdu(r31, r8, v0, r10)	C 1 5
	adde	r0, r27, r0		C 7 11
	std	r0, 0(rp)		C -4
	maddld(	r5, r8, v1, r29)	C 1 5
	maddhdu(r10, r8, v1, r29)	C 2 6
	addex(	r0, r12, r30, 0)	C 8 12
L(cj2):	maddld(	r27, r9, v0, r11)	C 1 5 -2
	maddhdu(r30, r9, v0, r11)	C 2 6 -1
	adde	r0, r26, r0		C 8 12 -3
	std	r0, 8(rp)		C -3
	mulld	r12, r9, v1		C 2 6 -1
	mulhdu	r11, r9, v1		C 3 7 0 = return limb
	addex(	r0, r5, r31, 0)		C 5 9 13
	adde	r0, r27, r0		C 5 9 13 -2
	std	r0, 16(rp)		C -2
	addex(	r0, r12, r30, 0)	C 6 10 -1
	adde	r0, r0, r10		C -1
	std	r0, 24(rp)		C -1
	li	r4, 0			C zero third operand for final addex
	addze	r3, r11			C fold CA chain into the return limb
	addex(	r3, r3, r4, 0)		C fold OV chain; r3 = carry limb out

L(ret):	ld	r26, -48(r1)		C restore callee-saved registers
	ld	r27, -40(r1)
	ld	r28, -32(r1)
	ld	r29, -24(r1)
	ld	r30, -16(r1)
	ld	r31, -8(r1)
	blr
EPILOGUE()
ASM_END()