1dnl PowerPC-32 mpn_addmul_1 -- Multiply a limb vector with a limb and add the 2dnl result to a second limb vector. 3 4dnl Copyright 1995, 1997, 1998, 2000-2003, 2005 Free Software Foundation, Inc. 5 6dnl This file is part of the GNU MP Library. 7dnl 8dnl The GNU MP Library is free software; you can redistribute it and/or modify 9dnl it under the terms of either: 10dnl 11dnl * the GNU Lesser General Public License as published by the Free 12dnl Software Foundation; either version 3 of the License, or (at your 13dnl option) any later version. 14dnl 15dnl or 16dnl 17dnl * the GNU General Public License as published by the Free Software 18dnl Foundation; either version 2 of the License, or (at your option) any 19dnl later version. 20dnl 21dnl or both in parallel, as here. 22dnl 23dnl The GNU MP Library is distributed in the hope that it will be useful, but 24dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 25dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 26dnl for more details. 27dnl 28dnl You should have received copies of the GNU General Public License and the 29dnl GNU Lesser General Public License along with the GNU MP Library. If not, 30dnl see https://www.gnu.org/licenses/. 31 32include(`../config.m4') 33 34C cycles/limb 35C 603e: ? 36C 604e: 6.75 37C 75x (G3): 8.7-14.3 38C 7400,7410 (G4): 8.7-14.3 39C 744x,745x (G4+): 9.5 40C power4/ppc970: 6.25 41C power5: 6.25 42 43C INPUT PARAMETERS 44C rp r3 45C up r4 46C n r5 47C vl r6 48 49C This is optimized for the PPC604. It has not been tuned for other 50C PowerPC processors. 51C 52C Loop Analysis for the 604: 53C 12 mem insn 54C 8 serializing insn 55C 8 int multiply 56C 25 int reg write 57C 9 int ops (8 of which serialize) 58C 59C The multiply insns need 16 cycles/4limb. 60C The integer register writes will need 13 cycles/4limb. 
C All-in-all, it should be possible to get to 4 or 5 cycles/limb on PPC604,
C but that will require some clever FPNOPS and BNOPS for exact
C issue control.


C mpn_addmul_1 (rp, up, n, vl)
C
C Computes {rp,n} += {up,n} * vl on 32-bit limbs and leaves the final carry
C limb in r3.  The first limb is handled before the limb count is tested, so
C the code expects n >= 1.
C
C Two code paths:
C   n <= 9  one limb per pass; r10 holds the pending high product word
C   n >  9  first limb alone, then (n-1)/4 passes of a 4-way unrolled loop,
C           then (n-1) mod 4 single limb passes; r0 holds the carry limb
C
C In both paths the carry bit (CA) stays live from one pass to the next.
C Only addc/adde/addze write it; the loads, stores, multiplies and bdnz
C placed between them leave it alone.

ASM_START()
PROLOGUE(mpn_addmul_1)
	cmpwi	cr0,r5,9
	bgt	cr0,L(large)		C n > 9: take the unrolled code

C ---- Short operands (n <= 9) ----
	mtctr	r5			C ctr = n
	lwz	r0,0(r4)		C up[0]
	mullw	r7,r0,r6		C low word of up[0]*vl
	mulhwu	r10,r0,r6		C high word, goes into the next limb
	lwz	r9,0(r3)		C rp[0]
	addc	r8,r7,r9		C first result limb, carry out in CA
	addi	r3,r3,-4		C bias rp so that stores can use 4(r3)
	bdz	L(sm_end)		C n = 1: only the store is left
L(sm_top):
	lwzu	r0,4(r4)		C next up limb, up advances
	stwu	r8,4(r3)		C store previous result, rp advances
	mullw	r8,r0,r6		C low word of product
	adde	r7,r8,r10		C + previous high word + CA
	mulhwu	r10,r0,r6		C high word of product
	lwz	r9,4(r3)		C next rp limb
	addze	r10,r10			C fold carry of the adde into high word
	addc	r8,r7,r9		C + rp limb, carry out in CA
	bdnz	L(sm_top)
L(sm_end):
	stw	r8,4(r3)		C store last result limb
	addze	r3,r10			C r3 = high word + CA
	blr

C ---- Long operands (n > 9) ----
L(large):
	stwu	r1,-16(r1)		C open a 16 byte stack frame
	addi	r5,r5,-1		C r5 = n-1, limbs left after the first
	stw	r30,8(r1)		C save r30, used as a temporary below
	srwi	r0,r5,2
	stw	r31,12(r1)		C save r31, used as a temporary below
	mtctr	r0			C ctr = (n-1)/4 unrolled passes

C First limb on its own; r0 becomes cy_limb, CA holds the add carry.
	lwz	r7,0(r4)
	mullw	r8,r7,r6
	mulhwu	r0,r7,r6
	lwz	r7,0(r3)
	addc	r8,r8,r7
	stw	r8,0(r3)

C Four limbs per pass.  The first adde chain adds each low product word to
C the preceding high word (r0); the second chain adds the four rp limbs.
C The carry out of the second chain enters the first chain of the next pass.
L(lg_top4):
	lwz	r7,4(r4)
	lwz	r12,8(r4)
	lwz	r30,12(r4)
	lwzu	r31,16(r4)		C up advances by 4 limbs
	mullw	r8,r7,r6
	mullw	r9,r12,r6
	mullw	r10,r30,r6
	mullw	r11,r31,r6
	adde	r8,r8,r0		C low0 + cy_limb + CA
	mulhwu	r0,r7,r6
	lwz	r7,4(r3)		C up limb is dead, reuse reg for rp limb
	adde	r9,r9,r0		C low1 + high0 + CA
	mulhwu	r0,r12,r6
	lwz	r12,8(r3)
	adde	r10,r10,r0		C low2 + high1 + CA
	mulhwu	r0,r30,r6
	lwz	r30,12(r3)
	adde	r11,r11,r0		C low3 + high2 + CA
	mulhwu	r0,r31,r6
	lwz	r31,16(r3)
	addze	r0,r0			C cy_limb = high3 + CA
	addc	r8,r8,r7
	stw	r8,4(r3)
	adde	r9,r9,r12
	stw	r9,8(r3)
	adde	r10,r10,r30
	stw	r10,12(r3)
	adde	r11,r11,r31		C CA stays live into the next pass
	stwu	r11,16(r3)		C rp advances by 4 limbs
	bdnz	L(lg_top4)

	andi.	r31,r5,3		C (n-1) mod 4 limbs remain, cr0 is set
	mtctr	r31
	beq	cr0,L(lg_end)		C nothing left over

C Leftover limbs, one per pass.
L(lg_top1):
	lwzu	r7,4(r4)
	mullw	r8,r7,r6
	adde	r8,r8,r0		C low + cy_limb + CA
	mulhwu	r0,r7,r6
	lwz	r7,4(r3)
	addze	r0,r0			C cy_limb = high + CA
	addc	r8,r8,r7
	stwu	r8,4(r3)
	bdnz	L(lg_top1)
L(lg_end):
	addze	r3,r0			C r3 = cy_limb + CA
	lwz	r30,8(r1)		C restore r30
	lwz	r31,12(r1)		C restore r31
	addi	r1,r1,16		C close the stack frame
	blr
EPILOGUE(mpn_addmul_1)