dnl  PowerPC-64 mpn_addmul_1 and mpn_submul_1 optimised for power6.

dnl  Copyright 1999-2001, 2003-2006, 2008, 2010, 2011 Free Software Foundation,
dnl  Inc.

dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of either:
dnl
dnl    * the GNU Lesser General Public License as published by the Free
dnl      Software Foundation; either version 3 of the License, or (at your
dnl      option) any later version.
dnl
dnl  or
dnl
dnl    * the GNU General Public License as published by the Free Software
dnl      Foundation; either version 2 of the License, or (at your option) any
dnl      later version.
dnl
dnl  or both in parallel, as here.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
dnl  for more details.
dnl
dnl  You should have received copies of the GNU General Public License and the
dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
dnl  see https://www.gnu.org/licenses/.

include(`../config.m4')

C                  mpn_addmul_1    mpn_submul_1
C                  cycles/limb     cycles/limb
C POWER3/PPC630          ?               ?
C POWER4/PPC970          ?               ?
C POWER5                 ?               ?
C POWER6                12.25           12.8
C POWER7                 ?               ?

C TODO
C  * Reduce register usage.
C  * Schedule function entry code.
C  * Unroll more.  8-way unrolling would bring us to 10 c/l, 16-way unrolling
C    would bring us to 9 c/l.
C  * Handle n = 1 and perhaps n = 2 separately, without saving any registers.
C INPUT PARAMETERS
C   rp  r3  limb vector that is read, combined with the products and
C           written back
C   up  r4  limb vector that is multiplied by v0 (read only)
C   n   r5  number of 64-bit limbs to process
C   v0  r6  the single multiplier limb
C RESULT
C   r3      last high product limb plus the outstanding carry (addmul_1)
C           or borrow (submul_1)
C SCRATCH
C   r0, r5, r7-r12, ctr, cr0, cr6 and the carry bit are overwritten.
C   r27-r31 are used as well but are saved on entry and restored on exit.
define(`rp', `r3')
define(`up', `r4')
define(`n',  `r5')
define(`v0', `r6')

C Operation-specific macros, chosen by whichever OPERATION_xxx symbol is
C defined when this file is processed:
C   ADDSUB       starts the carry chain combining product limbs with rp limbs
C   ADDSUBC      continues that chain through the carry bit
C   AM(x)        emits x in the addmul_1 build only
C   SM(x)        emits x in the submul_1 build only
C   CLRRSC(reg)  zeroes reg; leaves the carry bit clear for addmul_1 and set
C                for submul_1.  The addmul_1 form copies r0 into reg, so it
C                may only be used at a point where r0 = 0.
ifdef(`OPERATION_addmul_1',`
  define(ADDSUBC,	adde)
  define(ADDSUB,	addc)
  define(func,		mpn_addmul_1)
  define(func_nc,	mpn_addmul_1c)	C FIXME: not really supported
  define(AM,		`$1')
  define(SM,		`')
  define(CLRRSC,	`addic	$1, r0, 0')
')
ifdef(`OPERATION_submul_1',`
  define(ADDSUBC,	subfe)
  define(ADDSUB,	subfc)
  define(func,		mpn_submul_1)
  define(func_nc,	mpn_submul_1c)	C FIXME: not really supported
  define(AM,		`')
  define(SM,		`$1')
  define(CLRRSC,	`subfc	$1, r0, r0')
')

MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)

ASM_START()
PROLOGUE(func)
C Save r27-r31 at negative offsets from r1; r1 itself is never moved.
C NOTE(review): this presumes the ABI protects that area below the stack
C pointer for a routine that makes no calls.
	std	r31, -8(r1)
	std	r30, -16(r1)
	std	r29, -24(r1)
	std	r28, -32(r1)
	std	r27, -40(r1)

C Pick the loop entry point from n mod 4 and load ctr with (n + 3) / 4, so
C that a partial first pass counts as one full iteration.
C n = 0 is not catered for: ctr would start at 0 and bdnz would wrap around.
	rldicl.	r0, n, 0,62	C r0 = n mod 4, cr0 = r0 compared with 0
	cmpdi	cr6, r0, 2	C cr6 = r0 compared with 2
	addi	n, n, 3
	srdi	n, n, 2		C n = (n + 3) / 4
	mtctr	n		C r5 is free for reuse from here on
	beq	cr0, L(b0)	C n mod 4 = 0
	blt	cr6, L(b1)	C n mod 4 = 1
	beq	cr6, L(b2)	C n mod 4 = 2
C Otherwise n mod 4 = 3 and execution falls through.

C n mod 4 = 3: handle three limbs, joining the loop body at l3.
L(b3):
	ld	r8, 0(up)
	ld	r7, 8(up)
	ld	r27, 16(up)
	addi	up, up, 16	C up and rp now address the last limb of
	addi	rp, rp, 16	C this group
	mulld	r5, r8, v0
	mulhdu	r8, r8, v0
	mulld	r9, r7, v0
	mulhdu	r7, r7, v0
	mulld	r11, r27, v0
	mulhdu	r27, r27, v0
	ld	r29, -16(rp)
	ld	r30, -8(rp)
	ld	r31, 0(rp)
	addc	r9, r9, r8	C add each high half into the next low half
	adde	r11, r11, r7
	addze	r12, r27	C r12 = high limb handed to the next pass
	ADDSUB	r5, r5, r29	C start the chain that combines with rp
	b	L(l3)

C n mod 4 = 2: handle two limbs, joining the loop body at l2.
L(b2):
	ld	r7, 0(up)
	ld	r27, 8(up)
	addi	up, up, 8
	addi	rp, rp, 8
	mulld	r9, r7, v0
	mulhdu	r7, r7, v0
	mulld	r11, r27, v0
	mulhdu	r27, r27, v0
	ld	r30, -8(rp)
	ld	r31, 0(rp)
	addc	r11, r11, r7
	addze	r12, r27	C r12 = high limb handed to the next pass
	ADDSUB	r9, r9, r30
	b	L(l2)

C n mod 4 = 1: handle one limb, joining the loop body at l1.
L(b1):
	ld	r27, 0(up)
	ld	r31, 0(rp)
	mulld	r11, r27, v0
	mulhdu	r12, r27, v0	C r12 = high limb handed to the next pass
	ADDSUB	r11, r11, r31
	b	L(l1)

C n mod 4 = 0: bias both pointers by one limb so that the loop offsets
C address limbs 0..3, and start with nothing carried in.
L(b0):
	addi	up, up, -8
	addi	rp, rp, -8
	CLRRSC(	r12)		C r12 = 0 and carry bit preset (r0 = 0 here)

C Main loop, four limbs per pass.  At the top, up and rp point one limb
C below the group to be processed, r12 holds the high limb carried over from
C the previous group and the carry bit is the one left by the previous rp
C update chain (or by CLRRSC on first entry from b0).
	ALIGN(32)
L(top):
C submul_1 only: the subtract chain leaves the carry bit set when there was
C no borrow.  Invert it so that the adde chain below adds the borrow into
C the products.
SM(`	subfe	r11, r0, r0')		C r11 = carry - 1, that is 0 or -1
SM(`	addic	r11, r11, 1')		C carries out exactly when r11 = -1
	ld	r10, 8(up)
	ld	r8, 16(up)
	ld	r7, 24(up)
	ld	r27, 32(up)
	addi	up, up, 32
	addi	rp, rp, 32
	mulld	r0, r10, v0	C low halves end up in r0, r5, r9, r11
	mulhdu	r10, r10, v0	C high halves replace the source limbs
	mulld	r5, r8, v0
	mulhdu	r8, r8, v0
	mulld	r9, r7, v0
	mulhdu	r7, r7, v0
	mulld	r11, r27, v0
	mulhdu	r27, r27, v0
	ld	r28, -24(rp)
	adde	r0, r0, r12	C add carried-in limb and incoming carry
	ld	r29, -16(rp)
	adde	r5, r5, r10
	ld	r30, -8(rp)
	ld	r31, 0(rp)
	adde	r9, r9, r8
	adde	r11, r11, r7
	addze	r12, r27	C r12 = high limb handed to the next pass
	ADDSUB	r0, r0, r28	C start the chain that combines with rp
	std	r0, -24(rp)
	ADDSUBC	r5, r5, r29
L(l3):
	std	r5, -16(rp)
	ADDSUBC	r9, r9, r30
L(l2):
	std	r9, -8(rp)
	ADDSUBC	r11, r11, r31
L(l1):
	std	r11, 0(rp)
	bdnz	L(top)

C Form the return value in r3 while reloading the saved registers.
C addmul_1: r3 = r12 + carry.
C submul_1: r11 = carry - 1 is 0 without a borrow and -1 with one, so
C r3 = r12 - r11 adds the borrow to r12.
AM(`	addze	r3, r12')
SM(`	subfe	r11, r0, r0')		C r11 = carry - 1
	ld	r31, -8(r1)
SM(`	subf	r3, r11, r12')		C r3 = r12 - r11
	ld	r30, -16(r1)
	ld	r29, -24(r1)
	ld	r28, -32(r1)
	ld	r27, -40(r1)
	blr
EPILOGUE()