dnl  Alpha ev6 nails mpn_addmul_4.

dnl  Copyright 2002, 2005, 2006 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or
dnl  modify it under the terms of the GNU Lesser General Public License as
dnl  published by the Free Software Foundation; either version 3 of the
dnl  License, or (at your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful,
dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
dnl  Lesser General Public License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C Runs at 2.5 cycles/limb.

C We should go for 2-way unrolling over 17 cycles, for 2.125 c/l corresponding
C to 3.24 insn/cycle.
C mpn_addmul_4 -- multiply up[0..n-1] by the four limbs vp[0..3] and add the
C product into rp[], for builds that use nails.
C
C As coded below:
C   * reads vp[0..3], up[0..n-1] and rp[0..n-1]
C   * stores n+3 limbs to rp[0..n+2], each masked to NUMB_BITS bits
C   * returns the most significant result limb in r0
C   * rp[n..n+2] are written without being read first
C   * n is decremented and tested only after the first products have been
C     started, so n >= 1 is assumed (n = 0 would wrap) -- TODO confirm that
C     callers guarantee this
C
C Method: each vp limb is pre-shifted left by NAIL_BITS.  For a product
C (v << NAIL_BITS) * u, mulq gives the low 64 bits and umulh the high 64 bits,
C so (low >> NAIL_BITS) is the low numb part of v*u and the umulh result is
C v*u shifted right by NUMB_BITS (assuming NAIL_BITS + NUMB_BITS = 64).
C Carries are kept in the nail bits of the 64-bit accumulators and are pushed
C into the next limb with srl NUMB_BITS / addq.
C
C Stack: a 240 byte frame is allocated; only r12-r15 are saved in it, at
C offsets 32..56.

C  INPUT PARAMETERS
define(`rp',`r16')
define(`up',`r17')
define(`n',`r18')
define(`vp',`r19')

C  Useful register aliases
define(`numb_mask',`r24')
define(`ulimb',`r25')
define(`rlimb',`r27')

C  Product halves: mNa = mulq (low) result, mNb = umulh (high) result for vN
define(`m0a',`r0')
define(`m0b',`r1')
define(`m1a',`r2')
define(`m1b',`r3')
define(`m2a',`r20')
define(`m2b',`r21')
define(`m3a',`r12')
define(`m3b',`r13')

C  Running column sums, acc0 being the least significant pending column
define(`acc0',`r4')
define(`acc1',`r5')
define(`acc2',`r22')
define(`acc3',`r14')

C  The four multiplier limbs, held pre-shifted left by NAIL_BITS
define(`v0',`r6')
define(`v1',`r7')
define(`v2',`r23')
define(`v3',`r15')

C Used for temps: r8 r19 r28
C Note that r19 is also vp; it is reused as the carry temp only after all
C four vp limbs have been loaded.

define(`NAIL_BITS',`GMP_NAIL_BITS')
define(`NUMB_BITS',`GMP_NUMB_BITS')

C  This declaration is munged by configure
NAILS_SUPPORT(4-63)

ASM_START()
PROLOGUE(mpn_addmul_4)
C Allocate the frame and save the registers used below that are stored and
C reloaded around the body: r12 (m3a), r13 (m3b), r14 (acc3), r15 (v3).
	lda	r30,	-240(r30)
	stq	r12,	32(r30)
	stq	r13,	40(r30)
	stq	r14,	48(r30)
	stq	r15,	56(r30)

C numb_mask = all ones shifted right by NAIL_BITS, i.e. the low NUMB_BITS set.
	lda	numb_mask,-1(r31)
	srl	numb_mask,NAIL_BITS,numb_mask

	ldq	v0,	0(vp)
	ldq	v1,	8(vp)
	ldq	v2,	16(vp)
	ldq	v3,	24(vp)

C Clear the accumulators and pre-shift the multiplier limbs.  The last bis
C clears r19, which from here on holds the carry into the next output limb.
	bis	r31,	r31,	acc0		C	zero acc0
	sll	v0,NAIL_BITS,	v0
	bis	r31,	r31,	acc1		C	zero acc1
	sll	v1,NAIL_BITS,	v1
	bis	r31,	r31,	acc2		C	zero acc2
	sll	v2,NAIL_BITS,	v2
	bis	r31,	r31,	acc3		C	zero acc3
	sll	v3,NAIL_BITS,	v3
	bis	r31,	r31,	r19

C Feed-in: start the four products for up[0].  They are consumed either by
C the first loop iteration or, when n = 1, directly by the wind-down code.
	ldq	ulimb,	0(up)
	lda	up,	8(up)
	mulq	v0,	ulimb,	m0a		C U1
	umulh	v0,	ulimb,	m0b		C U1
	mulq	v1,	ulimb,	m1a		C U1
	umulh	v1,	ulimb,	m1b		C U1
	lda	n,	-1(n)
	mulq	v2,	ulimb,	m2a		C U1
	umulh	v2,	ulimb,	m2b		C U1
	mulq	v3,	ulimb,	m3a		C U1
	umulh	v3,	ulimb,	m3b		C U1
	beq	n,	L(end)			C U0

C Main loop, one rp limb per iteration.  Each product register is read
C (previous up limb) just before the multiply for the new up limb overwrites
C it, so the instruction order here must not be changed.
C Per iteration:
C   stored limb = (rlimb + carry + acc0 + (m0a >> NAIL_BITS)) & numb_mask
C   carry       = that same sum >> NUMB_BITS
C   acc0 = m0b + acc1 + (m1a >> NAIL_BITS)
C   acc1 = m1b + acc2 + (m2a >> NAIL_BITS)
C   acc2 = m2b + acc3 + (m3a >> NAIL_BITS)
C   acc3 = m3b
C The U0/U1/L0/L1 tags are the issue slots intended by the original schedule.
	ALIGN(16)
L(top):	bis	r31,	r31,	r31		C U1	nop
	ldq	rlimb,	0(rp)			C L0
	ldq	ulimb,	0(up)			C L1
	addq	r19,	acc0,	acc0		C U0	propagate nail

	bis	r31,	r31,	r31		C L0	nop
	bis	r31,	r31,	r31		C U1	nop
	bis	r31,	r31,	r31		C L1	nop
	bis	r31,	r31,	r31		C U0	nop

	lda	rp,	8(rp)			C L0
	srl	m0a,NAIL_BITS,	r8		C U0
	lda	up,	8(up)			C L1
	mulq	v0,	ulimb,	m0a		C U1

	addq	r8,	acc0,	r19		C U0
	addq	m0b,	acc1,	acc0		C L0
	umulh	v0,	ulimb,	m0b		C U1
	bis	r31,	r31,	r31		C L1	nop

	addq	rlimb,	r19,	r19		C L0
	srl	m1a,NAIL_BITS,	r8		C U0
	bis	r31,	r31,	r31		C L1	nop
	mulq	v1,	ulimb,	m1a		C U1

	addq	r8,	acc0,	acc0		C U0
	addq	m1b,	acc2,	acc1		C L0
	umulh	v1,	ulimb,	m1b		C U1
	and	r19,numb_mask,	r28		C L1	extract numb part

	bis	r31,	r31,	r31		C L0	nop
	srl	m2a,NAIL_BITS,	r8		C U0
	lda	n,	-1(n)			C L1
	mulq	v2,	ulimb,	m2a		C U1

	addq	r8,	acc1,	acc1		C L1
	addq	m2b,	acc3,	acc2		C L0
	umulh	v2,	ulimb,	m2b		C U1
	srl	r19,NUMB_BITS,	r19		C U0	extract nail part

	bis	r31,	r31,	r31		C L0	nop
	srl	m3a,NAIL_BITS,	r8		C U0
	stq	r28,	-8(rp)			C L1
	mulq	v3,	ulimb,	m3a		C U1

	addq	r8,	acc2,	acc2		C L0
	bis	r31,	m3b,	acc3		C L1
	umulh	v3,	ulimb,	m3b		C U1
	bne	n,	L(top)			C U0

C Wind-down: consume the last set of products exactly as one loop iteration
C would, but without starting new multiplies.  rp is not advanced here; the
C four stores below address rp[0], rp[8], rp[16] and rp[24] relative to the
C last limb position, which are the same addresses the code stored to when
C it still carried the redundant pointer bump formerly marked FIXME.
L(end):	ldq	rlimb,	0(rp)
	addq	r19,	acc0,	acc0		C	propagate nail
	srl	m0a,NAIL_BITS,	r8		C U0
	addq	r8,	acc0,	r19
	addq	m0b,	acc1,	acc0
	addq	rlimb,	r19,	r19
	srl	m1a,NAIL_BITS,	r8		C U0
	addq	r8,	acc0,	acc0
	addq	m1b,	acc2,	acc1
	and	r19,numb_mask,	r28		C	extract limb
	srl	m2a,NAIL_BITS,	r8		C U0
	addq	r8,	acc1,	acc1
	addq	m2b,	acc3,	acc2
	srl	r19,NUMB_BITS,	r19		C	extract nail
	srl	m3a,NAIL_BITS,	r8		C U0
	stq	r28,	0(rp)
	addq	r8,	acc2,	acc2
	bis	r31,	m3b,	acc3

C Flush the three pending columns: mask and store each one, carrying its
C nail part into the next.
	addq	r19,	acc0,	acc0		C	propagate nail
	and	acc0,numb_mask,	r28
	stq	r28,	8(rp)
	srl	acc0,NUMB_BITS,	r19
	addq	r19,	acc1,	acc1

	and	acc1,numb_mask,	r28
	stq	r28,	16(rp)
	srl	acc1,NUMB_BITS,	r19
	addq	r19,	acc2,	acc2

	and	acc2,numb_mask,	r28
	stq	r28,	24(rp)
	srl	acc2,NUMB_BITS,	r19
	addq	r19,	acc3,	r0		C	return value: top limb

C Restore the saved registers, release the frame and return.
	ldq	r12,	32(r30)
	ldq	r13,	40(r30)
	ldq	r14,	48(r30)
	ldq	r15,	56(r30)
	lda	r30,	240(r30)
	ret	r31,	(r26),	1
EPILOGUE()
ASM_END()