dnl  Alpha ev6 nails mpn_addmul_3.

dnl  Copyright 2002, 2006 Free Software Foundation, Inc.
dnl
dnl  This file is part of the GNU MP Library.
dnl
dnl  The GNU MP Library is free software; you can redistribute it and/or
dnl  modify it under the terms of the GNU Lesser General Public License as
dnl  published by the Free Software Foundation; either version 3 of the
dnl  License, or (at your option) any later version.
dnl
dnl  The GNU MP Library is distributed in the hope that it will be useful,
dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
dnl  Lesser General Public License for more details.
dnl
dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C mpn_addmul_3 -- multiply the n limbs at up by the three limbs at vp and
C add the product into the limbs at rp, for builds that use nail bits.
C
C Registers on entry, under the names defined below:
C   rp   destination; rp[0] .. rp[n-1] are read, summed into and rewritten,
C        then two further limbs rp[n] and rp[n+1] are stored outright
C   up   source limbs; exactly n of them are read
C   n    limb count; must be at least 1, because the first limb is loaded
C        and multiplied before n is tested
C   vp   the three multiplier limbs
C
C On exit the part of the result above rp[n+1] is left in r0.
C NOTE(review): r0 is taken to be the integer return register of the Alpha
C calling standard -- confirm against the ABI document.
C
C Runs at 3.0 cycles/limb.
C
C With 2-way unrolling, we could probably reach 2.25 c/l (3.33 i/c).


C  Incoming arguments
define(`rp',`r16')
define(`up',`r17')
define(`n',`r18')
define(`vp',`r19')

C  All-ones shifted right by NAIL_BITS: keeps the numb part of a limb
define(`numb_mask',`r24')

C  Limb of up being multiplied, and limb of rp being added in
define(`ulimb',`r25')
define(`rlimb',`r27')

C  mulq (lo) and umulh (hi) results of ulimb times v0, v1, v2
define(`p0lo',`r0')
define(`p0hi',`r1')
define(`p1lo',`r2')
define(`p1hi',`r3')
define(`p2lo',`r20')
define(`p2hi',`r21')

C  Partial sums for the three result limbs still in flight
define(`acc0',`r4')
define(`acc1',`r5')
define(`acc2',`r22')

C  Multiplier limbs, kept shifted left by NAIL_BITS
define(`v0',`r6')
define(`v1',`r7')
define(`v2',`r23')

C  Temporaries.  rsum lives in r19, the same register as vp; that is safe
C  because vp is not referenced again once v0, v1 and v2 have been loaded.
C  retval lives in r0, the same register as p0lo; it is written only after
C  the last read of p0lo.
define(`t_lo',`r8')
define(`rsum',`r19')
define(`outl',`r28')
define(`retval',`r0')

define(`NAIL_BITS',`GMP_NAIL_BITS')
define(`NUMB_BITS',`GMP_NUMB_BITS')

C  This declaration is munged by configure
NAILS_SUPPORT(3-63)

ASM_START()
PROLOGUE(mpn_addmul_3)
C  numb_mask = (all ones) >> NAIL_BITS
	lda	numb_mask,-1(r31)
	srl	numb_mask,NAIL_BITS,numb_mask

	ldq	v0,	0(vp)
	ldq	v1,	8(vp)
	ldq	v2,	16(vp)

C  Clear the accumulators and the carry.  Each multiplier is pre-shifted
C  left by NAIL_BITS, so that mulq leaves the low product bits at the top
C  of its result (brought back down with srl before use) and umulh yields
C  every product bit from position 64-NAIL_BITS upwards.
C  NOTE(review): presumably NUMB_BITS equals 64-NAIL_BITS, which would make
C  that split fall exactly on the numb boundary -- confirm in gmp.h.
	bis	r31,	r31,	acc0		C	acc0 = 0
	sll	v0,NAIL_BITS,	v0
	bis	r31,	r31,	acc1		C	acc1 = 0
	sll	v1,NAIL_BITS,	v1
	bis	r31,	r31,	acc2		C	acc2 = 0
	sll	v2,NAIL_BITS,	v2
	bis	r31,	r31,	rsum		C	no carry yet; vp is dead here

C  Start the three products for up[0] before entering the loop; each loop
C  pass consumes the products of one limb while issuing those of the next.
	ldq	ulimb,	0(up)
	lda	up,	8(up)
	mulq	v0,	ulimb,	p0lo		C U1
	umulh	v0,	ulimb,	p0hi		C U1
	mulq	v1,	ulimb,	p1lo		C U1
	umulh	v1,	ulimb,	p1hi		C U1
	lda	n,	-1(n)
	mulq	v2,	ulimb,	p2lo		C U1
	umulh	v2,	ulimb,	p2hi		C U1
	beq	n,	L(end)			C U0	single limb: only the tail

C  Main loop.  The U0/U1/L0/L1 tags are kept from the original source and
C  presumably name the ev6 issue slot of each instruction; the all-r31 bis
C  instructions are padding nops.  Do not reorder without re-measuring.
	ALIGN(16)
L(top):	ldq	rlimb,	0(rp)			C L1	limb to be added into
	ldq	ulimb,	0(up)			C L0	next source limb
	bis	r31,	r31,	r31		C U0	nop
	addq	rsum,	acc0,	acc0		C U1	fold in previous nail carry

	lda	rp,	8(rp)			C L1
	srl	p0lo,NAIL_BITS,	t_lo		C U0	low part of u*v0
	lda	up,	8(up)			C L0
	mulq	v0,	ulimb,	p0lo		C U1	next u*v0, low

	addq	t_lo,	acc0,	rsum		C U0	sum for the current limb
	addq	p0hi,	acc1,	acc0		C L1	high of u*v0 moves down a slot
	umulh	v0,	ulimb,	p0hi		C U1	next u*v0, high
	bis	r31,	r31,	r31		C L0	nop

	addq	rlimb,	rsum,	rsum		C L1	add the old rp limb
	srl	p1lo,NAIL_BITS,	t_lo		C U0	low part of u*v1
	bis	r31,	r31,	r31		C L0	nop
	mulq	v1,	ulimb,	p1lo		C U1	next u*v1, low

	addq	t_lo,	acc0,	acc0		C U0
	addq	p1hi,	acc2,	acc1		C L1	high of u*v1 moves down a slot
	umulh	v1,	ulimb,	p1hi		C U1	next u*v1, high
	and	rsum,numb_mask,	outl		C L0	numb part is the limb to store

	bis	r31,	r31,	r31		C L1	nop
	srl	p2lo,NAIL_BITS,	t_lo		C U0	low part of u*v2
	lda	n,	-1(n)			C L0
	mulq	v2,	ulimb,	p2lo		C U1	next u*v2, low

	addq	t_lo,	acc1,	acc1		C L0
	bis	r31,	p2hi,	acc2		C L1	acc2 = high of u*v2
	umulh	v2,	ulimb,	p2hi		C U1	next u*v2, high
	srl	rsum,NUMB_BITS,	rsum		C U0	nail part is the carry out

	stq	outl,	-8(rp)			C L	rp was already advanced
	bne	n,	L(top)			C U0

C  Tail: the same accumulation as one loop pass for the last source limb,
C  but with no further loads from up and no new multiplies.
L(end):	ldq	rlimb,	0(rp)
	addq	rsum,	acc0,	acc0		C fold in previous nail carry
	lda	rp,	8(rp)
	srl	p0lo,NAIL_BITS,	t_lo		C low part of u*v0
	addq	t_lo,	acc0,	rsum
	addq	p0hi,	acc1,	acc0
	addq	rlimb,	rsum,	rsum
	srl	p1lo,NAIL_BITS,	t_lo		C low part of u*v1
	addq	t_lo,	acc0,	acc0
	addq	p1hi,	acc2,	acc1
	and	rsum,numb_mask,	outl		C numb part is the limb to store
	srl	p2lo,NAIL_BITS,	t_lo		C low part of u*v2
	addq	t_lo,	acc1,	acc1
	bis	r31,	p2hi,	acc2
	srl	rsum,NUMB_BITS,	rsum		C nail part is the carry out
	stq	outl,	-8(rp)

C  Flush acc0 into the first limb past the summed area.
	addq	rsum,	acc0,	acc0		C fold in the carry
	and	acc0,numb_mask,	outl
	stq	outl,	0(rp)
	srl	acc0,NUMB_BITS,	rsum
	addq	rsum,	acc1,	acc1

C  Flush acc1 into the second; whatever remains, plus acc2, is the result.
	and	acc1,numb_mask,	outl
	stq	outl,	8(rp)
	srl	acc1,NUMB_BITS,	rsum
	addq	rsum,	acc2,	retval

	ret	r31,	(r26),	1
EPILOGUE()
ASM_END()