dnl  Power9 mpn_mul_2.

dnl  Contributed to the GNU project by Torbjörn Granlund.

dnl  Copyright 2018 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C		cycles/limb
C power9:	     1.58

C STATUS
C  * Not written with any power9 pipeline understanding.
C  * The 4x unrolling was not motivated by any timing tests.
C  * No local scheduling for performance tweaking has been done.
C  * Decrease load scheduling!

C Two independent carry chains are maintained in parallel: one through the
C CA bit (adde/addze, cleared by the subfic below) and one through the OV
C bit used by ISA 3.0's addex (cleared by the subfo below).

C INPUT PARAMETERS
define(`rp', `r3')
define(`up', `r4')
define(`n',  `r5')		C Note: Reused as scratch
define(`vp', `r6')		C Note: Reused for v1

define(`v0', `r7')
define(`v1', `r6')


ASM_START()
PROLOGUE(mpn_mul_2)
	std	r28, -32(r1)		C save callee-saved r28..r31
	std	r29, -24(r1)
	std	r30, -16(r1)
	std	r31, -8(r1)

	subfic	r0, n, 0	C clear CA
	subfo	r0, r0, r0	C clear OV and r0

	cmpdi	cr7, n, 4	C cr7 steers the small-n (n < 4) paths below

	ld	v0, 0(vp)
	ld	v1, 8(vp)

	srdi	r10, n, 2	C ctr = n / 4 (4x unrolled main loop)
	mtctr	r10

	rldicl.	r9, n, 0, 63	C test n mod 2
	bne	cr0, L(bx1)

L(bx0):	rldicl. r9, n, 63, 63	C test bit 1 of n (n mod 4 = 0 or 2)

	ld	r8, 0(up)
	ld	r9, 8(up)
	li	r11, 0
	mulld	r28, r8, v0
	mulhdu	r31, r8, v0
	blt	cr7, L(2)	C n = 2: short-circuit, no loop
	mulld	r5, r8, v1
	mulhdu	r10, r8, v1
	bne	cr0, L(b10)

L(b00):	addi	up, up, -8	C n mod 4 = 0: bias pointers, enter at lo0
	addi	rp, rp, -24
	b	L(lo0)

L(b10):	addi	up, up, 8	C n mod 4 = 2: bias pointers, enter at lo2
	addi	rp, rp, -8
	b	L(lo2)

L(2):	addi	rp, rp, -8	C n = 2 wind-down
	mulld	r5, r8, v1
	mulhdu	r10, r8, v1
	b	L(cj2)

L(bx1):	rldicl. r9, n, 63, 63	C test bit 1 of n (n mod 4 = 1 or 3)

	ld	r9, 0(up)
	ld	r8, 8(up)
	li	r10, 0
	mulld	r29, r9, v0
	mulhdu	r30, r9, v0
	mulld	r12, r9, v1
	mulhdu	r11, r9, v1
	bne	cr0, L(b11)

L(b01):	addi	rp, rp, -16	C n mod 4 = 1: enter at lo1
	b	L(lo1)
L(b11):	addi	up, up, 16	C n mod 4 = 3
	blt	cr7, L(end)	C n = 3: skip the loop

C Main loop, 4 limbs/iteration.  maddld/maddhdu fold the previous high
C halves (r10/r11) back in; adde sums one chain via CA, addex the other
C via OV.  r0 carries the limb being completed for each store.
L(top):	ld	r9, 0(up)
	maddld(	r28, r8, v0, r10)	C 0 4   -> adde
	maddhdu(r31, r8, v0, r10)	C 1 5
	adde	r0, r29, r0		C 7 11
	std	r0, 0(rp)
	mulld	r5, r8, v1		C 1 5   -> addex
	mulhdu	r10, r8, v1		C 2 6
	addex(	r0, r12, r30, 0)	C 8 12
L(lo2):	ld	r8, 8(up)
	maddld(	r29, r9, v0, r11)	C 1 5   -> adde
	maddhdu(r30, r9, v0, r11)	C 2 6
	adde	r0, r28, r0		C 8 12
	std	r0, 8(rp)
	mulld	r12, r9, v1		C 2 6   -> addex
	mulhdu	r11, r9, v1		C 3 7
	addex(	r0, r5, r31, 0)		C 5 9 13
L(lo1):	ld	r9, 16(up)
	maddld(	r28, r8, v0, r10)	C 2 6   -> adde
	maddhdu(r31, r8, v0, r10)	C 3 7
	adde	r0, r29, r0		C 5 9 13
	std	r0, 16(rp)
	mulld	r5, r8, v1		C 3 7   -> addex
	mulhdu	r10, r8, v1		C 4 8
	addex(	r0, r12, r30, 0)	C 6 10
L(lo0):	ld	r8, 24(up)
	maddld(	r29, r9, v0, r11)	C 3 7   -> adde
	maddhdu(r30, r9, v0, r11)	C 4 8
	adde	r0, r28, r0		C 6 10
	std	r0, 24(rp)
	mulld	r12, r9, v1		C 4 8   -> addex
	mulhdu	r11, r9, v1		C 5 9
	addex(	r0, r5, r31, 0)		C 7 11
	addi	up, up, 32
	addi	rp, rp, 32
	bdnz	L(top)

C Wind down: finish the last two limbs and fold both carry chains into
C the return limb (r11 plus CA plus OV).
L(end):	ld	r9, 0(up)
	maddld(	r28, r8, v0, r10)	C 0 4
	maddhdu(r31, r8, v0, r10)	C 1 5
	adde	r0, r29, r0		C 7 11
	std	r0, 0(rp)		C -4
	mulld	r5, r8, v1		C 1 5
	mulhdu	r10, r8, v1		C 2 6
	addex(	r0, r12, r30, 0)	C 8 12
L(cj2):	maddld(	r29, r9, v0, r11)	C 1 5  -2
	maddhdu(r30, r9, v0, r11)	C 2 6  -1
	adde	r0, r28, r0		C 8 12 -3
	std	r0, 8(rp)		C -3
	mulld	r12, r9, v1		C 2 6  -1
	mulhdu	r11, r9, v1		C 3 7  0 = return limb
	addex(	r0, r5, r31, 0)		C 5 9 13
	adde	r0, r29, r0		C 5 9 13 -2
	std	r0, 16(rp)		C -2
	addex(	r0, r12, r30, 0)	C 6 10 -1
	adde	r0, r0, r10		C -1
	std	r0, 24(rp)		C -1
	li	r4, 0
	addze	r3, r11			C return limb += CA
	addex(	r3, r3, r4, 0)		C return limb += OV

L(ret):	ld	r28, -32(r1)		C restore callee-saved regs
	ld	r29, -24(r1)
	ld	r30, -16(r1)
	ld	r31, -8(r1)
	blr
EPILOGUE()
ASM_END()