dnl  Alpha ev6 mpn_sub_n -- Subtract two limb vectors of the same length > 0
dnl  and store difference in a third limb vector.

dnl  Copyright 2000, 2003, 2005 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.

include(`../config.m4')

C      cycles/limb
C EV4:     ?
C EV5:     5.4
C EV6:     2.125

C  INPUT PARAMETERS
C  rp	r16
C  up	r17
C  vp	r18
C  n	r19
C  cy	r20   (for mpn_sub_nc)

C TODO
C   Finish cleaning up cy registers r22, r23 (make them use cy0/cy1)
C   Use multi-pronged feed-in.
C   Perform additional micro-tuning

C  This code was written in cooperation with ev6 pipeline expert Steve Root.

C  Pair loads and stores where possible
C  Store pairs oct-aligned where possible (didn't need it here)
C  Stores are delayed every third cycle
C  Loads and stores are delayed by fills
C  U stays still, put code there where possible (note alternation of U1 and U0)
C  L moves because of loads and stores
C  Note dampers in L to limit damage

C  This odd-looking optimization expects that we're having random bits in our
C  data, so that a pure zero result is unlikely, so we penalize the unlikely
C  case to help the common case.

C Register aliases.
C
C rp/up/vp/n name the argument registers listed under INPUT PARAMETERS.
C u0/v0 and u1/v1 are the two operand limb pairs in flight.  cy0..cy3 carry
C borrows from one limb to the next; cy0 doubles as the borrow-in argument of
C mpn_sub_nc.  r2, r5, r7, r8 hold raw differences (u - v, before the incoming
C borrow is applied) and r24, r25 hold finished limbs waiting to be stored.
define(`rp',  `r16')	C result pointer
define(`up',  `r17')	C minuend pointer
define(`vp',  `r18')	C subtrahend pointer
define(`n',   `r19')	C limb count

define(`u0',  `r0')
define(`u1',  `r3')
define(`v0',  `r1')
define(`v1',  `r4')

define(`cy0', `r20')
define(`cy1', `r21')
define(`cy2', `r22')
define(`cy3', `r23')

C Borrow handling.  For one limb the borrow out of u - v - b (b = 0 or 1) is
C (u < v) | ((u - v == 0) & b).  The straight-line code only computes (u < v)
C with cmpult.  Whenever a raw difference is exactly zero, a beq diverts to
C one of the $fixN stubs at the end of the file, which ORs the incoming borrow
C into the outgoing one and branches back to the matching $retN label.

MULFUNC_PROLOGUE(mpn_sub_n mpn_sub_nc)

ASM_START()

C mpn_sub_nc: the caller's borrow-in is already in cy0, so just join the
C common code behind the instruction that clears it.
PROLOGUE(mpn_sub_nc)
	br	r31,	$entry
EPILOGUE()

C mpn_sub_n: same as above with a zero borrow-in.  Returns the borrow out of
C the most significant limb in r0.
PROLOGUE(mpn_sub_n)
	bis	r31,	r31,	cy0	C    borrow-in = 0
$entry:	cmpult	n,	5,	r22	C L1 r22 = (n < 5), r22 is still free here
	ldq	u1,	0(up)		C L0 first limb pair
	ldq	v1,	0(vp)		C L1
	bne	r22,	$Lsmall		C    fewer than 5 limbs: simple loop only

C Feed-in: load five limb pairs and start the first three subtractions so
C that the pipeline state matches the top of $Loop / $Lend.
	ldq	u0,	8(up)		C L0 second limb pair
	ldq	v0,	8(vp)		C L1
	subq	u1,	v1,	r5	C U0 raw difference, first limb

	cmpult	u1,	v1,	cy3	C U0 its borrow-out
	ldq	u1,	16(up)		C L0 third limb pair
	ldq	v1,	16(vp)		C L1

	subq	u0,	v0,	r8	C U1 raw difference, second limb
	subq	r5,	cy0,	r24	C U0 apply the borrow-in

	cmpult	u0,	v0,	cy2	C U1 borrow-out of second limb
	beq	r5,	$fix5f		C U0 raw difference was zero
$ret5f:	ldq	u0,	24(up)		C L0 fourth limb pair
	ldq	v0,	24(vp)		C L1

	subq	r8,	cy3,	r25	C U1 apply borrow from first limb
	subq	u1,	v1,	r7	C U0 raw difference, third limb

	beq	r8,	$fix6f		C U1 raw difference was zero
$ret6f:	cmpult	u1,	v1,	cy3	C U0 borrow-out of third limb
	ldq	u1,	32(up)		C L0 fifth limb pair
	ldq	v1,	32(vp)		C L1

	lda	up,	40(up)		C L0 step past the five limbs loaded
	lda	vp,	40(vp)		C L1

	lda	rp,	-8(rp)		C    bias rp; stores below start at 8(rp)
	lda	n,	-13(n)		C L1 5 limbs in flight + 8 for one loop pass
	blt	n,	$Lend		C U1 not enough left for a full pass


C Main loop.  8-way unrolled.
C On entry r24 and r25 hold two finished limbs, r7 with cy2 (borrow in) and
C cy3 (borrow out) is half done, and u0/v0, u1/v1 hold the next two pairs.
	ALIGN(16)
$Loop:	subq	u0,	v0,	r2	C U1 raw difference
	stq	r24,	8(rp)		C L0 store a finished limb
	subq	r7,	cy2,	r24	C U0 apply previous borrow
	stq	r25,	16(rp)		C L1 store its partner

	cmpult	u0,	v0,	cy1	C U1 borrow-out
	beq	r7,	$fix7		C U0 raw difference was zero
$ret7:	ldq	u0,	0(up)		C L0 next limb pair
	ldq	v0,	0(vp)		C L1

	bis	r31,	r31,	r31	C L  nop, damper
	subq	r2,	cy3,	r25	C U1 apply previous borrow
	bis	r31,	r31,	r31	C L  nop, damper
	subq	u1,	v1,	r5	C U0 raw difference

	beq	r2,	$fix0		C U1 raw difference was zero
$ret0:	cmpult	u1,	v1,	cy0	C U0 borrow-out
	ldq	u1,	8(up)		C L0 next limb pair
	ldq	v1,	8(vp)		C L1

	subq	u0,	v0,	r8	C U1 raw difference
	stq	r24,	24(rp)		C L0 store pair
	subq	r5,	cy1,	r24	C U0 apply previous borrow
	stq	r25,	32(rp)		C L1

	cmpult	u0,	v0,	cy2	C U1 borrow-out
	beq	r5,	$fix1		C U0 raw difference was zero
$ret1:	ldq	u0,	16(up)		C L0 next limb pair
	ldq	v0,	16(vp)		C L1

	lda	rp,	64(rp)		C L0 advance rp for the whole pass
	subq	r8,	cy0,	r25	C U1 apply previous borrow
	lda	n,	-8(n)		C L1 count this pass
	subq	u1,	v1,	r7	C U0 raw difference

	beq	r8,	$fix2		C U1 raw difference was zero
$ret2:	cmpult	u1,	v1,	cy3	C U0 borrow-out
	ldq	u1,	24(up)		C L0 next limb pair
	ldq	v1,	24(vp)		C L1

	subq	u0,	v0,	r2	C U1 raw difference
	stq	r24,	-24(rp)		C L0 store pair (rp already advanced)
	subq	r7,	cy2,	r24	C U0 apply previous borrow
	stq	r25,	-16(rp)		C L1

	cmpult	u0,	v0,	cy1	C U1 borrow-out
	beq	r7,	$fix3		C U0 raw difference was zero
$ret3:	ldq	u0,	32(up)		C L0 next limb pair
	ldq	v0,	32(vp)		C L1

	bis	r31,	r31,	r31	C L  nop, damper
	subq	r2,	cy3,	r25	C U1 apply previous borrow
	bis	r31,	r31,	r31	C L  nop, damper
	subq	u1,	v1,	r5	C U0 raw difference

	beq	r2,	$fix4		C U1 raw difference was zero
$ret4:	cmpult	u1,	v1,	cy0	C U0 borrow-out
	ldq	u1,	40(up)		C L0 next limb pair
	ldq	v1,	40(vp)		C L1

	subq	u0,	v0,	r8	C U1 raw difference
	stq	r24,	-8(rp)		C L0 store pair
	subq	r5,	cy1,	r24	C U0 apply previous borrow
	stq	r25,	0(rp)		C L1

	cmpult	u0,	v0,	cy2	C U1 borrow-out
	beq	r5,	$fix5		C U0 raw difference was zero
$ret5:	ldq	u0,	48(up)		C L0 next limb pair
	ldq	v0,	48(vp)		C L1

	ldl	r31,	256(up)		C L0 prefetch
	subq	r8,	cy0,	r25	C U1 apply previous borrow
	ldl	r31,	256(vp)		C L1 prefetch
	subq	u1,	v1,	r7	C U0 raw difference

	beq	r8,	$fix6		C U1 raw difference was zero
$ret6:	cmpult	u1,	v1,	cy3	C U0 borrow-out
	ldq	u1,	56(up)		C L0 next limb pair
	ldq	v1,	56(vp)		C L1

	lda	up,	64(up)		C L0 eight limbs consumed
	bis	r31,	r31,	r31	C U  nop
	lda	vp,	64(vp)		C L1
	bge	n,	$Loop		C U1 another full pass available
C ==== main loop end

C Wind-down: finish the five limbs still in flight without loading more.
$Lend:	subq	u0,	v0,	r2	C U1 raw difference
	stq	r24,	8(rp)		C L0 store a finished limb
	subq	r7,	cy2,	r24	C U0 apply previous borrow
	stq	r25,	16(rp)		C L1 store its partner
	cmpult	u0,	v0,	cy1	C U1 borrow-out
	beq	r7,	$fix7c		C U0 raw difference was zero
$ret7c:	subq	r2,	cy3,	r25	C U1 apply previous borrow
	subq	u1,	v1,	r5	C U0 raw difference
	beq	r2,	$fix0c		C U1 raw difference was zero
$ret0c:	cmpult	u1,	v1,	cy0	C U0 borrow-out, ends up as the running borrow
	stq	r24,	24(rp)		C L0 store pair
	subq	r5,	cy1,	r24	C U0 apply previous borrow
	stq	r25,	32(rp)		C L1
	beq	r5,	$fix1c		C U0 raw difference was zero
$ret1c:	stq	r24,	40(rp)		C L0 last limb of the unrolled part
	lda	rp,	48(rp)		C L0 undo the -8 bias and step past 5 limbs

	lda	n,	8(n)		C    n = limbs still to do (0..7)
	beq	n,	$Lret		C    none: return the borrow in cy0

	ldq	u1,	0(up)		C    preload first leftover pair
	ldq	v1,	0(vp)

C Simple one-limb-per-pass code.  Entered with u1/v1 already loaded, cy0 = the
C running borrow and n = limbs left (at least 1).
$Lsmall:
	lda	n,	-1(n)		C    the last limb is handled at $Lend0
	beq	n,	$Lend0

	ALIGN(8)
$Loop0:	subq	u1,	v1,	r2	C    r2 = u - v
	cmpult	u1,	v1,	r8	C    r8 = borrow from u - v
	ldq	u1,	8(up)		C    next limb pair
	ldq	v1,	8(vp)
	subq	r2,	cy0,	r5	C    r5 = u - v - borrow
	lda	up,	8(up)
	lda	vp,	8(vp)
	stq	r5,	0(rp)
	cmpult	r2,	cy0,	cy0	C    borrow from subtracting the borrow
	lda	n,	-1(n)		C    one limb done
	bis	r8,	cy0,	cy0	C    either borrow propagates
	lda	rp,	8(rp)
	bne	n,	$Loop0
$Lend0:	subq	u1,	v1,	r2	C    final limb: r2 = u - v
	subq	r2,	cy0,	r5	C    r5 = u - v - borrow
	cmpult	u1,	v1,	r8	C    borrow from u - v
	cmpult	r2,	cy0,	cy0	C    borrow from subtracting the borrow
	stq	r5,	0(rp)
	bis	r8,	cy0,	r0	C    return value = borrow out
	ret	r31,(r26),1

	ALIGN(8)
$Lret:	lda	r0,	0(cy0)		C    return value = running borrow
	ret	r31,(r26),1

C Zero-difference fix-ups.  Each stub ORs the borrow that came into a limb
C into that limb's borrow-out and resumes at the matching $ret label.
$fix5f:	bis	cy3,	cy0,	cy3
	br	r31,	$ret5f
$fix6f:	bis	cy2,	cy3,	cy2
	br	r31,	$ret6f
$fix0:	bis	cy1,	cy3,	cy1
	br	r31,	$ret0
$fix1:	bis	cy0,	cy1,	cy0
	br	r31,	$ret1
$fix2:	bis	cy2,	cy0,	cy2
	br	r31,	$ret2
$fix3:	bis	cy3,	cy2,	cy3
	br	r31,	$ret3
$fix4:	bis	cy1,	cy3,	cy1
	br	r31,	$ret4
$fix5:	bis	cy1,	cy0,	cy0
	br	r31,	$ret5
$fix6:	bis	cy2,	cy0,	cy2
	br	r31,	$ret6
$fix7:	bis	cy3,	cy2,	cy3
	br	r31,	$ret7
$fix0c:	bis	cy1,	cy3,	cy1
	br	r31,	$ret0c
$fix1c:	bis	cy0,	cy1,	cy0
	br	r31,	$ret1c
$fix7c:	bis	cy3,	cy2,	cy3
	br	r31,	$ret7c

EPILOGUE()
ASM_END()