dnl  PowerPC-64 mpn_addmul_1 -- Multiply a limb vector with a limb and add
dnl  the result to a second limb vector.

dnl  Copyright 1999, 2000, 2001, 2003, 2004, 2005, 2006 Free Software
dnl  Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C		cycles/limb
C POWER3/PPC630:    6-18
C POWER4/PPC970:    8
C POWER5:	    8

C TODO
C  * Reduce the number of registers used.  Some mul destination registers could
C    be coalesced.
C  * Delay std for preserving registers, and suppress them for n=1.
C  * Write faster feed-in code.  If nothing else, avoid one or two up updates.

C INPUT PARAMETERS
define(`rp', `r3')
define(`up', `r4')
define(`n', `r5')
define(`vl', `r6')

C OVERVIEW
C  For each of the n limbs, the product up[i] * vl is added into rp[i] and the
C  sum is stored back to rp[i].  The limb that is left over after the last
C  store is returned in r3 (see L(ret)).
C
C  Register usage as it appears in the code below:
C    r26, r27	   limbs loaded from up
C    r28 - r31	   limbs loaded from rp
C    r0, r7, r9, r11   low product halves (mulld), later the sums to store
C    r5, r8, r10, r12  high product halves (mulhdu)
C    r12	   carry limb handed from one group of limbs to the next
C    ctr	   (n + 3) >> 2
C  Note that r5 is reused as a scratch register once n has been copied to ctr.
C
C  Carry convention at L(fic), L(top) and L(end): r12 holds the high product
C  half of the previous limb and the CA bit holds the carry out of the previous
C  addition into rp.  Both are absorbed by the first adde of the next group.
C
C  r26 - r31 are saved at negative offsets from r1 on entry and reloaded before
C  blr; r1 itself is never adjusted.  NOTE(review): this presumably relies on
C  the ABI allowing a leaf function to use that area -- confirm for new ports.
C
C  The instruction order is kept exactly as scheduled; do not reorder without
C  re-measuring the cycle counts listed above.

ASM_START()
PROLOGUE(mpn_addmul_1)
C Save the registers that are restored again before returning.
	std	r31, -8(r1)
	std	r30, -16(r1)
	std	r29, -24(r1)
	std	r28, -32(r1)
	std	r27, -40(r1)
	std	r26, -48(r1)

C Select the feed-in code from n mod 4 and set up the loop counter.
	rldicl.	r0, n, 0,62	C r0 = n & 3, set cr0
	cmpdi	cr6, r0, 2	C cr6 = compare (n & 3) with 2
	addi	n, n, 3		C compute count...
	srdi	n, n, 2		C ...for ctr
	mtctr	n		C copy count into ctr
	beq	cr0, L(b00)	C n mod 4 = 0
	blt	cr6, L(b01)	C n mod 4 = 1
	beq	cr6, L(b10)	C n mod 4 = 2
				C n mod 4 = 3 falls through

C n mod 4 = 3: handle one limb here, the rest in groups of two and four.
L(b11):	ld	r26, 0(up)
	ld	r28, 0(rp)
	addi	up, up, 8
	nop
	mulld	r0, r26, vl
	mulhdu	r12, r26, vl	C high half becomes the carry limb
	addc	r0, r0, r28	C CA stays pending for L(fic)
	std	r0, 0(rp)
	addi	rp, rp, 8
	b	L(fic)

C n mod 4 = 0: handle two limbs here.
L(b00):	ld	r26, 0(up)
	ld	r27, 8(up)
	ld	r28, 0(rp)
	ld	r29, 8(rp)
	addi	up, up, 16
	nop
	mulld	r0, r26, vl
	mulhdu	r5, r26, vl
	mulld	r7, r27, vl
	mulhdu	r8, r27, vl
	addc	r7, r7, r5	C chain the two products
	addze	r12, r8		C carry limb = high half + CA
	addc	r0, r0, r28	C now add in the rp limbs
	std	r0, 0(rp)
	adde	r7, r7, r29	C CA stays pending for L(fic)
	std	r7, 8(rp)
	addi	rp, rp, 16
	b	L(fic)

C n mod 4 = 1: bdnz decrements ctr and branches when more than one limb is
C to be processed; otherwise the single limb is handled right here.
L(b01):	bdnz	L(gt1)
	ld	r26, 0(up)
	ld	r28, 0(rp)
	mulld	r0, r26, vl
	mulhdu	r8, r26, vl	C L(ret) expects the high limb in r8
	addc	r0, r0, r28
	std	r0, 0(rp)
	b	L(ret)

C n mod 4 = 1 and n > 1: handle three limbs here.
L(gt1):	ld	r26, 0(up)
	ld	r27, 8(up)
	mulld	r0, r26, vl
	mulhdu	r5, r26, vl
	ld	r26, 16(up)
	ld	r28, 0(rp)
	mulld	r7, r27, vl
	mulhdu	r8, r27, vl
	ld	r29, 8(rp)
	ld	r30, 16(rp)
	mulld	r9, r26, vl
	mulhdu	r10, r26, vl
	addc	r7, r7, r5	C chain the three products
	adde	r9, r9, r8
	addze	r12, r10	C carry limb = high half + CA
	addc	r0, r0, r28	C now add in the rp limbs
	std	r0, 0(rp)
	adde	r7, r7, r29
	std	r7, 8(rp)
	adde	r9, r9, r30	C CA stays pending for L(fic)
	std	r9, 16(rp)
	addi	up, up, 24
	addi	rp, rp, 24
	b	L(fic)

C n mod 4 = 2: no limbs are handled up front, so start with no carry at all.
L(b10):	addic	r0, r0, 0	C adding zero leaves CA = 0
	li	r12, 0		C cy_limb = 0

C Common entry: load the next two up limbs.  bdz decrements ctr and skips
C the loop when the count reaches zero.
L(fic):	ld	r26, 0(up)
	ld	r27, 8(up)
	addi	up, up, 16
	bdz	L(end)

C Main loop, four limbs per pass.  The last two loads from up fetch the limbs
C that the next pass, or L(end), will multiply.
C					registers dying
L(top):	mulld	r0, r26, vl	C
	mulhdu	r5, r26, vl	C 26
	ld	r26, 0(up)	C
	ld	r28, 0(rp)	C
	mulld	r7, r27, vl	C
	mulhdu	r8, r27, vl	C 27
	ld	r27, 8(up)	C
	ld	r29, 8(rp)	C
	adde	r0, r0, r12	C 0 12
	adde	r7, r7, r5	C 5 7
	mulld	r9, r26, vl	C
	mulhdu	r10, r26, vl	C 26
	ld	r26, 16(up)	C
	ld	r30, 16(rp)	C
	mulld	r11, r27, vl	C
	mulhdu	r12, r27, vl	C 27
	ld	r27, 24(up)	C
	ld	r31, 24(rp)	C
	adde	r9, r9, r8	C 8 9
	adde	r11, r11, r10	C 10 11
	addze	r12, r12	C 12
	addc	r0, r0, r28	C 0 28
	std	r0, 0(rp)	C 0
	adde	r7, r7, r29	C 7 29
	std	r7, 8(rp)	C 7
	adde	r9, r9, r30	C 9 30
	std	r9, 16(rp)	C 9
	adde	r11, r11, r31	C 11 31
	std	r11, 24(rp)	C 11
	addi	up, up, 32	C
	addi	rp, rp, 32	C
	bdnz	L(top)		C

C Wind-down: the final two limbs, already loaded into r26 and r27.
L(end):	mulld	r0, r26, vl
	mulhdu	r5, r26, vl
	ld	r28, 0(rp)
	nop
	mulld	r7, r27, vl
	mulhdu	r8, r27, vl
	ld	r29, 8(rp)
	nop
	adde	r0, r0, r12	C absorb carry limb and pending CA
	adde	r7, r7, r5
	addze	r8, r8		C top limb = high half + CA
	addc	r0, r0, r28	C now add in the rp limbs
	std	r0, 0(rp)
	adde	r7, r7, r29
	std	r7, 8(rp)

C Return r8 plus the final CA in r3, then reload the saved registers.
L(ret):	addze	r3, r8
	ld	r31, -8(r1)
	ld	r30, -16(r1)
	ld	r29, -24(r1)
	ld	r28, -32(r1)
	ld	r27, -40(r1)
	ld	r26, -48(r1)
	blr
EPILOGUE()