dnl  Alpha ev6 mpn_add_n -- Add two limb vectors of the same length > 0 and
dnl  store sum in a third limb vector.

dnl  Copyright 2000, 2003, 2005 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C      cycles/limb
C EV4:     ?
C EV5:     5.4
C EV6:     2.125

C  INPUT PARAMETERS
C  rp	r16
C  up	r17
C  vp	r18
C  n	r19
C  cy	r20   (for mpn_add_nc)

C TODO
C   Finish cleaning up cy registers r22, r23 (make them use cy0/cy1)
C   Use multi-pronged feed-in.
C   Perform additional micro-tuning

C  This code was written in cooperation with ev6 pipeline expert Steve Root.

C  Pair loads and stores where possible
C  Store pairs oct-aligned where possible (didn't need it here)
C  Stores are delayed every third cycle
C  Loads and stores are delayed by fills
C  U stays still, put code there where possible (note alternation of U1 and U0)
C  L moves because of loads and stores
C  Note dampers in L to limit damage

C  This odd-looking optimization expects that we have random bits in our
C  data, so that a pure zero result is unlikely.  We therefore penalize the
C  unlikely case to help the common case.
C Register aliases.  u0/u1 and v0/v1 receive the limbs loaded through r17 and
C r18 respectively; cy0/cy1 are two of the carry registers.  r22 and r23 act
C as two further carry registers and are used under their raw names below.
C Note that u0 is r0, which is also the register the carry is returned in.
define(`u0', `r0')  define(`u1', `r3')
define(`v0', `r1')  define(`v1', `r4')

define(`cy0', `r20')  define(`cy1', `r21')

MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc)

ASM_START()

C mpn_add_nc -- entry point that takes a carry-in.
C The caller-supplied carry is already in r20 (cy0), so this entry only jumps
C to the shared body at $entry, bypassing the instruction in mpn_add_n that
C clears cy0.
PROLOGUE(mpn_add_nc)
	br	r31,	$entry
EPILOGUE()

C mpn_add_n -- add the n-limb vectors read through r17 and r18, store the
C n-limb sum through r16, and return the carry out of the top limb in r0.
C
C On entry:  r16 = rp, r17 = up, r18 = vp, r19 = n (> 0),
C            r20 = carry-in (mpn_add_nc only; mpn_add_n clears it).
C
C Layout of the body:
C   feed-in   n >= 5 only: loads limbs 0..4 and starts the carry chain
C   $Loop     8 limbs per pass, run while at least 8 unloaded limbs remain
C   $Lend     finishes and stores the 5 limbs still in flight
C   $Lsmall   one limb per pass; handles n < 5 and the 0..7 leftover limbs
C   $fix*     out-of-line carry fix-ups, see the note ahead of them
C
C Carry scheme: each limb pair is first added without its carry-in and the
C wrap of that add is recorded with cmpult.  The carry-in is added afterwards.
C Provided the carry-in is 0 or 1, that second add can only wrap when its
C result is exactly zero, so the result is tested with beq and the rare zero
C case branches to a $fix stub which ORs the carry-in into the recorded
C carry-out.
PROLOGUE(mpn_add_n)
	bis	r31,	r31,	cy0	C clear carry in
$entry:	cmpult	r19,	5,	r22	C L1 r22 = (n < 5)
	ldq	u1,	0(r17)		C L0 get next ones
	ldq	v1,	0(r18)		C L1
	bne	r22,	$Lsmall		C fewer than 5 limbs: one-limb loop only

C Feed-in.  Loads limbs 0..4.  When this section is left, r5 and r8 hold the
C finished sums of limbs 0 and 1 (not yet stored), r7 holds the raw sum of
C limb 2 with r22 as its pending carry-in and r23 as its raw carry-out,
C u0/v0 hold limb 3 and u1/v1 hold limb 4.
	ldq	u0,	8(r17)		C L0 get next ones
	ldq	v0,	8(r18)		C L1
	addq	u1,	v1,	r5	C U0 add two data

	cmpult	r5,	v1,	r23	C U0 did it carry
	ldq	u1,	16(r17)		C L0 get next ones
	ldq	v1,	16(r18)		C L1

	addq	u0,	v0,	r8	C U1 add two data
	addq	r5,	cy0,	r5	C U0 carry in

	cmpult	r8,	v0,	r22	C U1 did it carry
	beq	r5,	$fix5f		C U0 fix exact zero
$ret5f:	ldq	u0,	24(r17)		C L0 get next ones
	ldq	v0,	24(r18)		C L1

	addq	r8,	r23,	r8	C U1 carry from last
	addq	u1,	v1,	r7	C U0 add two data

	beq	r8,	$fix6f		C U1 fix exact zero
$ret6f:	cmpult	r7,	v1,	r23	C U0 did it carry
	ldq	u1,	32(r17)		C L0 get next ones
	ldq	v1,	32(r18)		C L1

	lda	r17,	40(r17)		C L0 move pointer
	lda	r18,	40(r18)		C L1 move pointer

	lda	r16,	-8(r16)		C bias rp by one limb; first store is 8(r16)
	lda	r19,	-13(r19)	C L1 n - 13: 5 limbs in flight + 8 per pass
	blt	r19,	$Lend		C U1 loop control


C Main loop.  8-way unrolled.
C Each pass loads 8 limb pairs and stores 8 result limbs; the stores trail
C the loads by 5 limbs.  r16 and r19 are stepped in the middle of the pass,
C which is why the second half of the stores use offsets -24..0.
	ALIGN(16)
$Loop:	addq	u0,	v0,	r2	C U1 add two data
	addq	r7,	r22,	r7	C U0 add in carry
	stq	r5,	8(r16)		C L0 put an answer
	stq	r8,	16(r16)		C L1 pair

	cmpult	r2,	v0,	cy1	C U1 did it carry
	beq	r7,	$fix7		C U0 fix exact 0
$ret7:	ldq	u0,	0(r17)		C L0 get next ones
	ldq	v0,	0(r18)		C L1

	bis	r31,	r31,	r31	C L  damp out
	addq	r2,	r23,	r2	C U1 carry from last
	bis	r31,	r31,	r31	C L  moves in L !
	addq	u1,	v1,	r5	C U0 add two data

	beq	r2,	$fix0		C U1 fix exact zero
$ret0:	cmpult	r5,	v1,	cy0	C U0 did it carry
	ldq	u1,	8(r17)		C L0 get next ones
	ldq	v1,	8(r18)		C L1

	addq	u0,	v0,	r8	C U1 add two data
	addq	r5,	cy1,	r5	C U0 carry from last
	stq	r7,	24(r16)		C L0 store pair
	stq	r2,	32(r16)		C L1

	cmpult	r8,	v0,	r22	C U1 did it carry
	beq	r5,	$fix1		C U0 fix exact zero
$ret1:	ldq	u0,	16(r17)		C L0 get next ones
	ldq	v0,	16(r18)		C L1

	lda	r16,	64(r16)		C L0 move pointer
	addq	r8,	cy0,	r8	C U1 carry from last
	lda	r19,	-8(r19)		C L1 move counter
	addq	u1,	v1,	r7	C U0 add two data

	beq	r8,	$fix2		C U1 fix exact zero
$ret2:	cmpult	r7,	v1,	r23	C U0 did it carry
	ldq	u1,	24(r17)		C L0 get next ones
	ldq	v1,	24(r18)		C L1

	addq	u0,	v0,	r2	C U1 add two data
	addq	r7,	r22,	r7	C U0 add in carry
	stq	r5,	-24(r16)	C L0 put an answer
	stq	r8,	-16(r16)	C L1 pair

	cmpult	r2,	v0,	cy1	C U1 did it carry
	beq	r7,	$fix3		C U0 fix exact 0
$ret3:	ldq	u0,	32(r17)		C L0 get next ones
	ldq	v0,	32(r18)		C L1

	bis	r31,	r31,	r31	C L  damp out
	addq	r2,	r23,	r2	C U1 carry from last
	bis	r31,	r31,	r31	C L  moves in L !
	addq	u1,	v1,	r5	C U0 add two data

	beq	r2,	$fix4		C U1 fix exact zero
$ret4:	cmpult	r5,	v1,	cy0	C U0 did it carry
	ldq	u1,	40(r17)		C L0 get next ones
	ldq	v1,	40(r18)		C L1

	addq	u0,	v0,	r8	C U1 add two data
	addq	r5,	cy1,	r5	C U0 carry from last
	stq	r7,	-8(r16)		C L0 store pair
	stq	r2,	0(r16)		C L1

	cmpult	r8,	v0,	r22	C U1 did it carry
	beq	r5,	$fix5		C U0 fix exact zero
$ret5:	ldq	u0,	48(r17)		C L0 get next ones
	ldq	v0,	48(r18)		C L1

	ldl	r31,	256(r17)	C L0 prefetch
	addq	r8,	cy0,	r8	C U1 carry from last
	ldl	r31,	256(r18)	C L1 prefetch
	addq	u1,	v1,	r7	C U0 add two data

	beq	r8,	$fix6		C U1 fix exact zero
$ret6:	cmpult	r7,	v1,	r23	C U0 did it carry
	ldq	u1,	56(r17)		C L0 get next ones
	ldq	v1,	56(r18)		C L1

	lda	r17,	64(r17)		C L0 move pointer
	bis	r31,	r31,	r31	C U
	lda	r18,	64(r18)		C L1 move pointer
	bge	r19,	$Loop		C U1 loop control
C ==== main loop end

C Wind-down.  No further loads: complete the 5 limbs still in flight (two
C finished sums in r5/r8, one raw sum in r7, two loaded pairs in u0/v0 and
C u1/v1) and store them.  The carry out of the last of them ends up in cy0.
$Lend:	addq	u0,	v0,	r2	C U1 add two data
	addq	r7,	r22,	r7	C U0 add in carry
	stq	r5,	8(r16)		C L0 put an answer
	stq	r8,	16(r16)		C L1 pair
	cmpult	r2,	v0,	cy1	C U1 did it carry
	beq	r7,	$fix7c		C U0 fix exact 0
$ret7c:	addq	r2,	r23,	r2	C U1 carry from last
	addq	u1,	v1,	r5	C U0 add two data
	beq	r2,	$fix0c		C U1 fix exact zero
$ret0c:	cmpult	r5,	v1,	cy0	C U0 did it carry
	addq	r5,	cy1,	r5	C U0 carry from last
	stq	r7,	24(r16)		C L0 store pair
	stq	r2,	32(r16)		C L1
	beq	r5,	$fix1c		C U0 fix exact zero
$ret1c:	stq	r5,	40(r16)		C L0 put an answer
	lda	r16,	48(r16)		C L0 move pointer past the 5 stored limbs

	lda	r19,	8(r19)		C r19 = limbs still to do (0..7)
	beq	r19,	$Lret		C none left: return the carry in cy0

	ldq	u1,	0(r17)		C next limb pair for the one-limb loop
	ldq	v1,	0(r18)

C One-limb-at-a-time path, used for n < 5 and for the leftover limbs.
C On entry u1/v1 hold the next limb pair, r19 the number of limbs left and
C cy0 the carry-in.  $Loop0 handles all but the last limb and preloads the
C following pair on each pass; $Lend0 handles the last limb without loading
C anything further.
$Lsmall:
	lda	r19,	-1(r19)		C the last limb is done at $Lend0
	beq	r19,	$Lend0

	ALIGN(8)
$Loop0:	addq	u1,	v1,	r2	C main add
	cmpult	r2,	v1,	r8	C compute cy from last add
	ldq	u1,	8(r17)
	ldq	v1,	8(r18)
	addq	r2,	cy0,	r5	C carry add
	lda	r17,	8(r17)
	lda	r18,	8(r18)
	stq	r5,	0(r16)
	cmpult	r5,	r2,	cy0	C compute cy from last add
	lda	r19,	-1(r19)		C decr loop cnt
	bis	r8,	cy0,	cy0	C combine cy from the two adds
	lda	r16,	8(r16)
	bne	r19,	$Loop0
$Lend0:	addq	u1,	v1,	r2	C main add
	addq	r2,	cy0,	r5	C carry add
	cmpult	r2,	v1,	r8	C compute cy from last add
	cmpult	r5,	r2,	cy0	C compute cy from last add
	stq	r5,	0(r16)
	bis	r8,	cy0,	r0	C combine cy from the two adds
	ret	r31,(r26),1

C Return path taken when the wind-down consumed the final limb.
	ALIGN(8)
$Lret:	lda	r0,	0(cy0)		C copy carry into return register
	ret	r31,(r26),1

C Carry fix-up stubs.  Each one is reached by the beq that follows a
C "sum plus carry-in" add whose result was exactly zero.  The stub ORs that
C carry-in register into the register holding the carry-out recorded for the
C same limb, then branches back to the instruction after the beq.  The f
C suffix marks the stubs used by the feed-in and the c suffix those used by
C the wind-down at $Lend; the unsuffixed ones belong to the main loop.
$fix5f:	bis	r23,	cy0,	r23	C bring forward carry
	br	r31,	$ret5f
$fix6f:	bis	r22,	r23,	r22	C bring forward carry
	br	r31,	$ret6f
$fix0:	bis	cy1,	r23,	cy1	C bring forward carry
	br	r31,	$ret0
$fix1:	bis	cy0,	cy1,	cy0	C bring forward carry
	br	r31,	$ret1
$fix2:	bis	r22,	cy0,	r22	C bring forward carry
	br	r31,	$ret2
$fix3:	bis	r23,	r22,	r23	C bring forward carry
	br	r31,	$ret3
$fix4:	bis	cy1,	r23,	cy1	C bring forward carry
	br	r31,	$ret4
$fix5:	bis	cy1,	cy0,	cy0	C bring forward carry
	br	r31,	$ret5
$fix6:	bis	r22,	cy0,	r22	C bring forward carry
	br	r31,	$ret6
$fix7:	bis	r23,	r22,	r23	C bring forward carry
	br	r31,	$ret7
$fix0c:	bis	cy1,	r23,	cy1	C bring forward carry
	br	r31,	$ret0c
$fix1c:	bis	cy0,	cy1,	cy0	C bring forward carry
	br	r31,	$ret1c
$fix7c:	bis	r23,	r22,	r23	C bring forward carry
	br	r31,	$ret7c

EPILOGUE()
ASM_END()